Use scons to generate prebuilds This is a manual step for merging upstream. scons is used to generate a bunch of cl kernels, and scons is not present as an Android.bp, so this was ran manually. Test: n/a Change-Id: Ia2631bc30c4ed59c73230a89b0319fd1e828041c

commit: eb0170eb99a8d03384e87ff20a38f76ce16de252 [log] [tgz]
author: Kevin DuBois <kevindubois@google.com> Thu Aug 06 09:13:53 2020 -0700
committer: Kevin DuBois <kevindubois@google.com> Fri Aug 21 19:35:04 2020 -0700
tree: 0d466fae5e5ab00e7bb66290fb909f3af9620196
parent: 15cb0bc3d410187173f9f6179dca07725cfd1c08 [diff]
diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/absdiff.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/absdiff.clembed
new file mode 100644
index 0000000..bb7ab7b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/absdiff.clembed

@@ -0,0 +1,608 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Calculate the absolute difference of two input images.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in]  in1_ptr                           Pointer to the first source image. Supported data types: U8, S16
+ * @param[in]  in1_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  in2_ptr                           Pointer to the second source image. Supported data types: U8, S16
+ * @param[in]  in2_stride_x                      Stride of the second source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the second source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void absdiff(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+    Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+    vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/accumulate.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/accumulate.clembed
new file mode 100644
index 0000000..4248df0
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/accumulate.clembed

@@ -0,0 +1,673 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function accumulates an input image into output image.
+ *
+ * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
+ * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
+ */
+__kernel void accumulate(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(accu))
+{
+    // Get pixels pointer
+    Image input = CONVERT_TO_IMAGE_STRUCT(input);
+    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
+
+    // Load data
+    uchar16 in_data   = vload16(0, input.ptr);
+    short16 accu_data = vload16(0, (__global short *)accu.ptr);
+
+    // Perform accumulation
+    short16 res = add_sat(convert_short16(in_data), accu_data);
+
+    // Store result
+    vstore16(res, 0, (__global short *)accu.ptr);
+}
+
+/** This function accumulates a weighted value from an input image to an output image.
+ *
+ * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
+ * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @param[in]  alpha                               The float scalar value with a value in the range of 0 to 1
+ */
+__kernel void accumulate_weighted(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(accu),
+    const float alpha)
+{
+    // Get pixels pointer
+    Image input = CONVERT_TO_IMAGE_STRUCT(input);
+    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
+
+    // Load data
+    const float16 in_data   = convert_float16(vload16(0, input.ptr));
+    const float16 accu_data = convert_float16(vload16(0, accu.ptr));
+
+    // Calculate weighted accumulation
+    const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
+
+    // Store result
+    vstore16(res, 0, accu.ptr);
+}
+
+/** This function accumulates a squared value from an input image to an output image.
+ *
+ * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
+ * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @param[in]  shift                               The U32 scalar value with a value in the range of 0 to 15
+ */
+__kernel void accumulate_squared(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(accu),
+    const uint shift)
+{
+    // Get pixels pointer
+    Image input = CONVERT_TO_IMAGE_STRUCT(input);
+    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
+
+    // Load data
+    ushort16 in_data   = convert_ushort16(vload16(0, input.ptr));
+    uint16   accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
+
+    // Calculate squared accumulation
+    short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
+
+    // Store result
+    vstore16(res, 0, (__global short *)accu.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/activation_float_helpers.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_float_helpers.hembed
new file mode 100644
index 0000000..661f879
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_float_helpers.hembed

@@ -0,0 +1,616 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/activation_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_layer.clembed
new file mode 100644
index 0000000..820c82b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_layer.clembed

@@ -0,0 +1,698 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+/** This performs an activation function floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    // Perform activation
+    data = ACTIVATION(ACT, DATA_TYPE, data, A_VAL, B_VAL);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+#endif /* defined(ACT) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/activation_layer_quant.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_layer_quant.clembed
new file mode 100644
index 0000000..bfa3e44
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_layer_quant.clembed

@@ -0,0 +1,1395 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(S1_VAL) && !defined(S2_VAL)
+#define S2_VAL S1_VAL
+#endif // defined(S1_VAL) && !defined(S2_VAL)
+#if defined(O1_VAL) && !defined(O2_VAL)
+#define O2_VAL O1_VAL
+#endif // defined(O1_VAL) && !defined(O2_VAL)
+
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+    return max((TYPE)CONST_0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+    return min((TYPE)A_VAL, max((TYPE)CONST_0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#if defined(S1_VAL) && defined(S2_VAL)
+#if defined(O1_VAL) && defined(O2_VAL)
+#define PERFORM_ACTIVATION_QUANT(act, data)                                                       \
+    ({                                                                                            \
+        data = ACTIVATION_OP(act, data);                                                          \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
+        \
+        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                           \
+    })
+#else // defined(O1_VAL) && defined(O2_VAL)
+#define PERFORM_ACTIVATION_QUANT(act, data)                             \
+    ({                                                                  \
+        data = ACTIVATION_OP(act, data);                                \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                  \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));          \
+        \
+        fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL));       \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
+    })
+#endif /* defined(O1_VAL) && defined(O2_VAL) */
+#else  /* defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QUANT(act, data) \
+    ({                                      \
+        data = ACTIVATION_OP(act, data);    \
+    })
+#endif /* defined(S1_VAL) && defined(S2_VAL) */
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+
+#if defined(FLOAT_DOMAIN)
+// Activations performed in the float domain
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+/** This performs an activation function on quantized inputs with float transformations.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant_f32(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VEC_FLOAT data_flt = CONVERT(data, VEC_FLOAT);
+#if defined(O1_VAL)
+    data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+#else  // defined(O1_VAL)
+    data_flt        = round(data_flt) * ((float)S1_VAL);
+#endif // defined(O1_VAL)
+    data_flt = ACTIVATION(ACT, float, data_flt, A_VAL, B_VAL);
+
+#if defined(O2_VAL)
+    data = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
+#else  // defined(O2_VAL)
+    data            = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
+#endif // defined(O2_VAL)
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+#else // defined(FLOAT_DOMAIN)
+// Activations performed in the quantized domain
+
+#if defined(ACT)
+/** This performs an activation function on quantized inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    data = PERFORM_ACTIVATION_QUANT(ACT, data);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(ACT)
+#endif // defined(FLOAT_DOMAIN)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/activation_quant_helpers.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_quant_helpers.hembed
new file mode 100644
index 0000000..48ef641
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/activation_quant_helpers.hembed

@@ -0,0 +1,627 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(S1_VAL) && !defined(S2_VAL)
+#define S2_VAL S1_VAL
+#endif // defined(S1_VAL) && !defined(S2_VAL)
+#if defined(O1_VAL) && !defined(O2_VAL)
+#define O2_VAL O1_VAL
+#endif // defined(O1_VAL) && !defined(O2_VAL)
+
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+    return max((TYPE)CONST_0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+    return min((TYPE)A_VAL, max((TYPE)CONST_0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#if defined(S1_VAL) && defined(S2_VAL)
+#if defined(O1_VAL) && defined(O2_VAL)
+#define PERFORM_ACTIVATION_QUANT(act, data)                                                       \
+    ({                                                                                            \
+        data = ACTIVATION_OP(act, data);                                                          \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
+        \
+        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                           \
+    })
+#else // defined(O1_VAL) && defined(O2_VAL)
+#define PERFORM_ACTIVATION_QUANT(act, data)                             \
+    ({                                                                  \
+        data = ACTIVATION_OP(act, data);                                \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                  \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));          \
+        \
+        fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL));       \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
+    })
+#endif /* defined(O1_VAL) && defined(O2_VAL) */
+#else  /* defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QUANT(act, data) \
+    ({                                      \
+        data = ACTIVATION_OP(act, data);    \
+    })
+#endif /* defined(S1_VAL) && defined(S2_VAL) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/arg_min_max.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/arg_min_max.clembed
new file mode 100644
index 0000000..de9c054
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/arg_min_max.clembed

@@ -0,0 +1,992 @@
+R"(
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
+#define ISLESS(x, y) select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
+#if defined(WIDTH)
+#if defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+/** Find index minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
+{
+    int end_elem = (x_idx + 1) * 16;
+    if(end_elem > WIDTH)
+    {
+        end_elem = WIDTH - x_idx * 16;
+    }
+    DATA_TYPE_OUTPUT res = prev_res[0];
+    for(int x_v = 1; x_v < end_elem; ++x_v)
+    {
+        res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < * (input + res));
+    }
+    return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find index minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+    DATA_TYPE_OUTPUT res = 0;
+    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+    {
+        res = select(res, x_v, *(input + x_v) < * (input + res));
+    }
+    return res;
+#else  // WIDTH >= 16
+    int       x_elem   = x_idx * 16;
+    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+    x_elem -= x_goback;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input - x_goback);
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+    idx_sel       = (in.s01234567 <= in.s89abcdef);
+    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+    idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+    idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+    in.s01      = select(in.s23, in.s01, idx_sel.s01);
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+    idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+    return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MIN)
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+/** Find index maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
+{
+    int end_elem = (x_idx + 1) * 16;
+    if(end_elem > WIDTH)
+    {
+        end_elem = WIDTH - x_idx * 16;
+    }
+    DATA_TYPE_OUTPUT res = prev_res[0];
+    for(int x_v = 1; x_v < end_elem; ++x_v)
+    {
+        res = select(res, prev_res[x_v], *(input + prev_res[x_v]) > *(input + res));
+    }
+    return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find index maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+    DATA_TYPE_OUTPUT res = 0;
+    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+    {
+        res = select(res, x_v, *(input + x_v) > *(input + res));
+    }
+    return res;
+#else  // WIDTH >= 16
+    int       x_elem   = x_idx * 16;
+    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+    x_elem -= x_goback;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input - x_goback);
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+    idx_sel       = (in.s01234567 >= in.s89abcdef);
+    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+    idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+    idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+    in.s01      = select(in.s23, in.s01, idx_sel.s01);
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+    idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+    return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX)
+
+/** This kernel performs parallel reduction given an operation on x-axis.
+ *
+ * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed using -DPREV_OUTPUT
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the ArgMax
+ * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the ArgMin
+ *
+ * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[in] prev_res_ptr                              (Optional) Pointer to previous results tensor. Supported data types: U32/S32
+ * @param[in] prev_res_stride_x                         (Optional) Stride of the output tensor in X dimension (in bytes)
+ * @param[in] prev_res_step_x                           (Optional) prev_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] prev_res_stride_y                         (Optional) Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] prev_res_step_y                           (Optional) prev_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element in the previous results tensor
+ * @param[in] partial_res_ptr                           The local buffer to hold partial result values. Supported data types: U32/S32
+ * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] local_results                             Local buffer for storing the partial result
+ */
+__kernel void arg_min_max_x(
+    IMAGE_DECLARATION(src),
+#if defined(PREV_OUTPUT)
+    IMAGE_DECLARATION(prev_res),
+#endif // defined(PREV_OUTPUT)
+    IMAGE_DECLARATION(partial_res),
+    __local DATA_TYPE_OUTPUT *local_results)
+{
+#if defined(PREV_OUTPUT)
+    Image src      = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+    Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
+#else  // !defined(PREV_OUTPUT)
+    Image src                      = CONVERT_TO_IMAGE_STRUCT(src);
+#endif // defined(PREV_OUTPUT)
+    Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
+
+    unsigned int lsize = get_local_size(0);
+    unsigned int lid   = get_local_id(0);
+
+    const uint     x_idx                 = get_global_id(0);
+    const uint     y_idx                 = get_global_id(1);
+    const __global DATA_TYPE *src_in_row = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
+
+    for(unsigned int y = 0; y < get_local_size(1); ++y)
+    {
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+        local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else  // !defined(PREV_OUTPUT)
+        local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#else  // defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+        local_results[lid]         = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else  // !defined(PREV_OUTPUT)
+        local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Looking for the next highest power of 2 (maximum value of lsize is 8)
+        unsigned int middle = lsize - 1;
+        middle |= middle >> 1;
+        middle |= middle >> 2;
+        middle += 1;
+        // Perform parallel reduction
+        for(unsigned int i = middle; i > 0; i >>= 1)
+        {
+            if( lid < i && lid + i < lsize)
+            {
+                DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
+                DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
+#if defined(ARG_MAX)
+                local_results[lid] = select(
+                                         local_results[lid],
+                                         local_results[lid + i],
+                                         ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1));
+#else  // defined(ARG_MIN)
+                local_results[lid] = select(
+                                         local_results[lid],
+                                         local_results[lid + i],
+                                         ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lid == 0)
+        {
+            ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+        }
+    }
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(output))
+{
+    Image src    = CONVERT_TO_IMAGE_STRUCT(src);
+    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    indx = 0;
+    for(unsigned int y = 1; y < HEIGHT; ++y)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+        VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+        cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+        indx      = select(indx, y, cond_conv);
+        res       = select(res, in, CONDITION_TO_USE(in, res));
+    }
+
+    // Store result
+    vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    indx = 0;
+    for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+        VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+        cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+        indx      = select(indx, z, cond_conv);
+        res       = select(res, in, CONDITION_TO_USE(in, res));
+    }
+
+    // Store result
+    vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    indx = 0;
+    for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+        VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+        cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+        indx      = select(indx, w, cond_conv);
+        res       = select(res, in, CONDITION_TO_USE(in, res));
+    }
+
+    // Store result
+    vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/batch_to_space.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/batch_to_space.clembed
new file mode 100644
index 0000000..ef5cfe7
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/batch_to_space.clembed

@@ -0,0 +1,775 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software withoutput restriction, including withoutput limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/batchnormalization_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/batchnormalization_layer.clembed
new file mode 100644
index 0000000..f0520c4
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/batchnormalization_layer.clembed

@@ -0,0 +1,1573 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+/** Apply batch normalization.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D out = in;
+#else  /* IN_PLACE */
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res = 0;
+
+    const int current_slice = get_global_id(2);
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+    res = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res          = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+    // beta is not zero, hence we need to perform the addition
+    res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D out = in;
+#else  /* IN_PLACE */
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res = 0;
+
+    const int current_slice = get_global_id(0);
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+
+    res = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+    // beta is not zero, hence we need to perform the addition
+    res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
+
+#if defined(DATA_TYPE) && defined(EPSILON)
+/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
+ *
+ * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
+ * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
+ *       For depthwise convolution weight do not pass DIM2
+ * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
+ * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  w_ptr                                 Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in]  w_stride_x                            Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  w_step_x                              w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  w_stride_y                            Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  w_step_y                              w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  w_stride_z                            Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  w_step_z                              w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  w_offset_first_element_in_bytes       The offset of the first element in the weights tensor
+ * @param[in]  b_ptr                                 (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
+ * @param[in]  b_stride_x                            (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  b_step_x                              (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  b_stride_y                            (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  b_step_y                              (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  b_stride_z                            (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  b_step_z                              (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  b_offset_first_element_in_bytes       (Optional) The offset of the first element in the bias tensor
+ * @param[in]  mean_ptr                              Pointer to the mean source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  mean_stride_x                         Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                           mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes    The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                               Pointer to the var tensor. Supported data types: same as @p w_ptr
+ * @param[in]  var_stride_x                          Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                            var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes     The offset of the first element in the var source tensor
+ * @param[out] w_fused_ptr                           (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
+ * @param[in]  w_fused_stride_x                      (Optional) Stride of the destination weights tensor in X dimension (in bytes)
+ * @param[in]  w_fused_step_x                        (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  w_fused_stride_y                      (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
+ * @param[in]  w_fused_step_y                        (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  w_fused_stride_z                      (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
+ * @param[in]  w_fused_step_z                        (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
+ * @param[in]  b_fused_ptr                           (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
+ * @param[in]  b_fused_stride_x                      (Optional) Stride of the destination bias tensor in X dimension (in bytes)
+ * @param[in]  b_fused_step_x                        (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
+ * @param[in]  beta_ptr                              (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  beta_stride_x                         (Optional) Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                           (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes    (Optional) The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                             (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  gamma_stride_x                        (Optional) Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                          (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes   (Optional) The offset of the first element in the gamma source tensor
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
+#if defined(BIAS)
+                                            VECTOR_DECLARATION(b),
+#endif // defined(BIAS)
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var)
+#ifndef IN_PLACE_W
+                                            ,
+                                            TENSOR3D_DECLARATION(w_fused)
+#endif // ifndef IN_PLACE_W
+#ifndef IN_PLACE_B
+                                            ,
+                                            VECTOR_DECLARATION(b_fused)
+#endif // ifndef IN_PLACE_B
+#if defined(BETA)
+                                            ,
+                                            VECTOR_DECLARATION(beta)
+#endif // defined(BETA)
+#if defined(GAMMA)
+                                            ,
+                                            VECTOR_DECLARATION(gamma)
+#endif // defined(GAMMA)
+                                           )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+#if defined(DIM2)
+    int c0 = z % DIM2;
+    int c1 = z / DIM2;
+#else // ! defined(DIM2)
+    int c0 = 0;
+#if defined(NHWC)
+    int c1 = x;
+#else  // defined(NHWC)
+    int c1 = z;
+#endif // defined(NHWC)
+#endif // defined(DIM2)
+
+    int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
+    int v_offset = c1 * sizeof(DATA_TYPE);
+
+    DATA_TYPE w_old = 0.0f;
+    DATA_TYPE b_old = 0.0f;
+    DATA_TYPE w_new = 0.0f;
+    DATA_TYPE b_new = 0.0f;
+    DATA_TYPE gamma = 1.0f;
+    DATA_TYPE mean  = 0.0f;
+    DATA_TYPE var   = 1.0f;
+    DATA_TYPE beta  = 0.0f;
+
+    w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
+    var   = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
+    mean  = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
+
+#if defined(GAMMA)
+    gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
+#endif // defined(GAMMA)
+
+    // Compute new weight
+    w_new = (gamma * w_old) / (sqrt(var + EPSILON));
+
+#if defined(IN_PLACE_W)
+    *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
+#else  // defined(IN_PLACE_W)
+    *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
+#endif // defined(IN_PLACE_W)
+
+    // Compute bias
+#if !defined(DIM2) && defined(NHWC)
+    if(z == 0 && y == 0)
+#else !defined(DIM2) && defined(NHWC)
+    if(x == 0 && y == 0 && c0 == 0)
+#endif // !defined(DIM2) && defined(NHWC)
+    {
+#if defined(BIAS)
+        b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
+#endif // defined(BIAS)
+#if defined(BETA)
+        beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
+#endif // defined(BETA)
+
+        b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
+
+#if defined(BIAS)
+
+#if defined(IN_PLACE_B)
+        *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
+#else  // defined(IN_PLACE_B)
+        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // defined(IN_PLACE_B)
+
+#else // defined(BIAS)
+
+#ifndef IN_PLACE_B
+        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // ifndef IN_PLACE_B
+
+#endif // defined(BIAS)
+    }
+}
+#endif // defined(DATA_TYPE) && defined(EPSILON)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/bitwise_op.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/bitwise_op.clembed
new file mode 100644
index 0000000..c5c923e
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/bitwise_op.clembed

@@ -0,0 +1,702 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function computes the bitwise OR of two input images.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_or(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+    Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    uchar16 in_a = vload16(0, in1.ptr);
+    uchar16 in_b = vload16(0, in2.ptr);
+
+    vstore16(in_a | in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise AND of two input images.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_and(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+    Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    uchar16 in_a = vload16(0, in1.ptr);
+    uchar16 in_b = vload16(0, in2.ptr);
+
+    vstore16(in_a & in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise XOR of two input images.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_xor(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+    Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    uchar16 in_a = vload16(0, in1.ptr);
+    uchar16 in_b = vload16(0, in2.ptr);
+
+    vstore16(in_a ^ in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise NOT of an image.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_not(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    uchar16 in_data = vload16(0, in.ptr);
+
+    vstore16(~in_data, 0, out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/bounding_box_transform.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/bounding_box_transform.clembed
new file mode 100644
index 0000000..5d0aad6
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/bounding_box_transform.clembed

@@ -0,0 +1,666 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants
+
+/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
+ * -# -DIMG_WIDTH= Original image width
+ * -# -DIMG_HEIGHT= Original image height
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
+ *
+ * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: F16/F32
+ * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  boxes_stride_z                           Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p in_ptr
+ * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: same as @p in_ptr
+ * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  deltas_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform(
+    VECTOR_DECLARATION(boxes),
+    IMAGE_DECLARATION(pred_boxes),
+    IMAGE_DECLARATION(deltas))
+{
+    // Get pixels pointer
+    Vector boxes      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+    Image  pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+    Image  deltas     = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+    // Load delta and box values into registers
+    const DATA_TYPE one     = (DATA_TYPE)1.f;
+    const DATA_TYPE halfone = (DATA_TYPE)0.5f;
+
+    const int py = get_global_id(1); // box
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    scale_before = (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_BEFORE;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    delta = vload4(0, (__global DATA_TYPE *)deltas.ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    box = vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)) / scale_before;
+
+    // Calculate width and centers of the old boxes
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    dims = box.s23 - box.s01 + one;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    ctr = box.s01 + halfone * dims;
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    weights = (VEC_DATA_TYPE(DATA_TYPE, 4))(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+    delta /= weights;
+    delta.s23 = min(delta.s23, (DATA_TYPE)BBOX_XFORM_CLIP);
+
+    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    pred_ctr = delta.s01 * dims + ctr;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    pred_dims = exp(delta.s23) * dims;
+
+    // Useful vector constant definitions
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_values = (VEC_DATA_TYPE(DATA_TYPE, 4))(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    sign = (VEC_DATA_TYPE(DATA_TYPE, 4))(-1, -1, 1, 1);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    min_values = 0;
+
+    // Calculate the coordinates of the new boxes
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly adjust the predicted boxes
+    pred_box.s23 -= one;
+#endif // Possibly adjust the predicted boxes
+    pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the predicted boxes
+    pred_box *= (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_AFTER;
+#endif // Possibly scale the predicted boxes
+
+    // Store them into the output
+    vstore4(pred_box, 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
+
+#endif // defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/bounding_box_transform_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/bounding_box_transform_quantized.clembed
new file mode 100644
index 0000000..6b0f146
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/bounding_box_transform_quantized.clembed

@@ -0,0 +1,1142 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+#if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants
+
+/** Perform a padded copy of input tensor to the output tensor for quantized data types. Padding values are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM16 for boxes and pred_boxes, QASYMM8 for for deltas
+ * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
+ * -# -DIMG_WIDTH= Original image width
+ * -# -DIMG_HEIGHT= Original image height
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
+ *
+ * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: QASYMM16
+ * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  boxes_stride_z                           Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p in_ptr
+ * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: QASYMM8
+ * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  deltas_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform_quantized(
+    VECTOR_DECLARATION(boxes),
+    IMAGE_DECLARATION(pred_boxes),
+    IMAGE_DECLARATION(deltas))
+{
+    // Get pixels pointer
+    Vector boxes      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+    Image  pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+    Image  deltas     = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+    // Load delta and box values into registers
+    const float one     = 1.f;
+    const float halfone = 0.5f;
+
+    const int py           = get_global_id(1); // box
+    float4    scale_before = (float4)SCALE_BEFORE;
+    float4 delta           = DEQUANTIZE(vload4(0, (__global DATA_TYPE_DELTAS *)deltas.ptr), OFFSET_DELTAS, SCALE_DELTAS, DATA_TYPE_DELTAS, 4);
+    float4 box             = DEQUANTIZE(vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)), OFFSET_BOXES, SCALE_BOXES, DATA_TYPE, 4) / scale_before;
+
+    // Calculate width and centers of the old boxes
+    float2 dims    = box.s23 - box.s01 + one;
+    float2 ctr     = box.s01 + halfone * dims;
+    float4 weights = (float4)(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+    delta /= weights;
+    delta.s23 = min(delta.s23, (float)BBOX_XFORM_CLIP);
+
+    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
+    float2 pred_ctr  = delta.s01 * dims + ctr;
+    float2 pred_dims = exp(delta.s23) * dims;
+
+    // Useful vector constant definitions
+    float4 max_values = (float4)(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+    float4 sign       = (float4)(-1, -1, 1, 1);
+    float4 min_values = 0;
+
+    // Calculate the coordinates of the new boxes
+    float4 pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly adjust the predicted boxes
+    pred_box.s23 -= one;
+#endif // Possibly adjust the predicted boxes
+    pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the predicted boxes
+    pred_box *= (float4)SCALE_AFTER;
+#endif // Possibly scale the predicted boxes
+
+    // Store them into the output
+    vstore4(QUANTIZE(pred_box, OFFSET_PRED_BOXES, SCALE_PRED_BOXES, DATA_TYPE, 4), 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
+#endif // Check for compile time constants
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/canny.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/canny.clembed
new file mode 100644
index 0000000..34223e8
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/canny.clembed

@@ -0,0 +1,997 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Calculate the magnitude and phase from horizontal and vertical result of sobel result.
+ *
+ * @note The calculation of gradient uses level 1 normalisation.
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in]  src1_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in]  src1_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src1_step_x                         src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src1_step_y                         src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  src2_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in]  src2_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src2_step_x                         src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src2_step_y                         src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] grad_ptr                            Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in]  grad_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  grad_step_x                         grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  grad_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  grad_step_y                         grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  grad_offset_first_element_in_bytes  The offset of the first element of the output
+ * @param[out] angle_ptr                           Pointer to the angle output. Supported data types: U8
+ * @param[in]  angle_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  angle_step_x                        angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  angle_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  angle_step_y                        angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L1(
+    IMAGE_DECLARATION(src1),
+    IMAGE_DECLARATION(src2),
+    IMAGE_DECLARATION(grad),
+    IMAGE_DECLARATION(angle))
+{
+    // Construct images
+    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
+    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
+    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+
+    // Load sobel horizontal and vertical values
+    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+    h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
+    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+    v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);
+
+    /* Calculate the gradient, using level 1 normalisation method */
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
+    m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
+
+    /* Calculate the angle */
+    float4 p = 180.0f * atan2pi(convert_float4(v), convert_float4(h));
+
+    /* Remap angle to range [0, 256) */
+    p = select(p, p + 180.0f, p < 0.0f);
+
+    /* Store results */
+    vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
+    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
+}
+
+/** Calculate the gradient and angle from horizontal and vertical result of sobel result.
+ *
+ * @note The calculation of gradient uses level 2 normalisation
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in]  src1_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in]  src1_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src1_step_x                         src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src1_step_y                         src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  src2_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in]  src2_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src2_step_x                         src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src2_step_y                         src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] grad_ptr                            Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in]  grad_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  grad_step_x                         grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  grad_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  grad_step_y                         grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  grad_offset_first_element_in_bytes  The offset of the first element of the output
+ * @param[out] angle_ptr                           Pointer to the angle output. Supported data types: U8
+ * @param[in]  angle_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  angle_step_x                        angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  angle_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  angle_step_y                        angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L2(
+    IMAGE_DECLARATION(src1),
+    IMAGE_DECLARATION(src2),
+    IMAGE_DECLARATION(grad),
+    IMAGE_DECLARATION(angle))
+{
+    // Construct images
+    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
+    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
+    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+
+    // Load sobel horizontal and vertical values
+    float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr));
+    float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr));
+
+    /* Calculate the gradient, using level 2 normalisation method */
+    float4 m = sqrt(h * h + v * v);
+
+    /* Calculate the angle */
+    float4 p = 180.0f * atan2pi(v, h);
+
+    /* Remap angle to range [0, 256) */
+    p = select(p, p + 180.0f, p < 0.0f);
+
+    /* Store results */
+    vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr);
+    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
+}
+
+#define EDGE 255
+#define NO_EDGE 0
+
+/** Array that holds the relative coordinates offset for the neighbouring pixels.
+ */
+__constant short4 neighbours_coords[] =
+{
+    { -1, 0, 1, 0 },  // 0
+    { -1, -1, 1, 1 }, // 45
+    { 0, -1, 0, 1 },  // 90
+    { 1, -1, -1, 1 }, // 135
+};
+
+/** Perform non maximum suppression.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in]  grad_ptr                              Pointer to the gradient output. Supported data types: S16, S32
+ * @param[in]  grad_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  grad_step_x                           grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  grad_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  grad_step_y                           grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  grad_offset_first_element_in_bytes    The offset of the first element of the output
+ * @param[in]  angle_ptr                             Pointer to the angle output. Supported data types: U8
+ * @param[in]  angle_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  angle_step_x                          angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  angle_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  angle_step_y                          angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  angle_offset_first_element_in_bytes   TThe offset of the first element of the output
+ * @param[out] non_max_ptr                           Pointer to the non maximum suppressed output. Supported data types: U16, U32
+ * @param[in]  non_max_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  non_max_step_x                        non_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  non_max_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  non_max_step_y                        non_max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  non_max_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in]  lower_thr                             The low threshold
+ */
+__kernel void suppress_non_maximum(
+    IMAGE_DECLARATION(grad),
+    IMAGE_DECLARATION(angle),
+    IMAGE_DECLARATION(non_max),
+    uint lower_thr)
+{
+    // Construct images
+    Image grad    = CONVERT_TO_IMAGE_STRUCT(grad);
+    Image angle   = CONVERT_TO_IMAGE_STRUCT(angle);
+    Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
+
+    // Index
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Get gradient and angle
+    DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
+    uchar an              = *((__global uchar *)angle.ptr);
+
+    // Early return if not greater than lower threshold
+    if(gradient <= lower_thr)
+    {
+        return;
+    }
+
+    // Divide the whole round into 4 directions
+    DATA_TYPE_OUT q_an;
+
+    if(an < 22.5f || an >= 157.5f)
+    {
+        q_an = 0;
+    }
+    else if(an < 67.5f)
+    {
+        q_an = 1;
+    }
+    else if(an < 112.5f)
+    {
+        q_an = 2;
+    }
+    else
+    {
+        q_an = 3;
+    }
+
+    // Find the two pixels in the perpendicular direction
+    short2       x_p = neighbours_coords[q_an].s02;
+    short2       y_p = neighbours_coords[q_an].s13;
+    DATA_TYPE_IN g1  = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
+    DATA_TYPE_IN g2  = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
+
+    if((gradient > g1) && (gradient > g2))
+    {
+        __global uchar *non_max_addr            = non_max_ptr + non_max_offset_first_element_in_bytes + x * non_max_stride_x + y * non_max_stride_y;
+        *((global DATA_TYPE_OUT *)non_max_addr) = gradient;
+    }
+}
+
+#define hysteresis_local_stack_L1 8  // The size of level 1 stack. This has to agree with the host side
+#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation
+
+/** Check whether pixel is valid
+ *
+ * Skip the pixel if the early_test fails.
+ * Otherwise, it tries to add the pixel coordinate to the stack, and proceed to popping the stack instead if the stack is full
+ *
+ * @param[in] early_test Boolean condition based on the minv check and visited buffer check
+ * @param[in] x_pos      X-coordinate of pixel that is going to be recorded, has to be within the boundary
+ * @param[in] y_pos      Y-coordinate of pixel that is going to be recorded, has to be within the boundary
+ * @param[in] x_cur      X-coordinate of current central pixel
+ * @param[in] y_cur      Y-coordinate of current central pixel
+ */
+#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur)                               \
+    {                                                                                     \
+        if(!early_test)                                                                   \
+        {                                                                                 \
+            /* Number of elements in the local stack 1, points to next available entry */ \
+            c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur));              \
+            \
+            if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */           \
+                goto pop_stack;                                                           \
+            \
+            /* The pixel that has already been recorded is ignored */                     \
+            if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1))           \
+            {                                                                             \
+                l1_ptr[c] = (short2)(x_pos, y_pos);                                       \
+                *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1;         \
+            }                                                                             \
+        }                                                                                 \
+    }
+
+/** Perform hysteresis.
+ *
+ * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
+ *
+ * @param[in]  src_ptr                                        Pointer to the input image. Supported data types: U8
+ * @param[in]  src_stride_x                                   Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                                     src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                                   Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                                     src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes              The offset of the first element of the output
+ * @param[out] out_ptr                                        Pointer to the output image. Supported data types: U8
+ * @param[in]  out_stride_x                                   Stride of the source image in X dimension (in bytes)
+ * @param[in]  out_step_x                                     out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                                   Stride of the source image in Y dimension (in bytes)
+ * @param[in]  out_step_y                                     out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes              The offset of the first element of the output
+ * @param[out] visited_ptr                                    Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
+ * @param[in]  visited_stride_x                               Stride of the source image in X dimension (in bytes)
+ * @param[in]  visited_step_x                                 visited_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  visited_stride_y                               Stride of the source image in Y dimension (in bytes)
+ * @param[in]  visited_step_y                                 visited_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  visited_offset_first_element_in_bytes          The offset of the first element of the output
+ * @param[out] recorded_ptr                                   Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
+ * @param[in]  recorded_stride_x                              Stride of the source image in X dimension (in bytes)
+ * @param[in]  recorded_step_x                                recorded_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  recorded_stride_y                              Stride of the source image in Y dimension (in bytes)
+ * @param[in]  recorded_step_y                                recorded_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  recorded_offset_first_element_in_bytes         The offset of the first element of the output
+ * @param[out] l1_stack_ptr                                   Pointer to the l1 stack of a pixel. Supported data types: S32
+ * @param[in]  l1_stack_stride_x                              Stride of the source image in X dimension (in bytes)
+ * @param[in]  l1_stack_step_x                                l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  l1_stack_stride_y                              Stride of the source image in Y dimension (in bytes)
+ * @param[in]  l1_stack_step_y                                l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  l1_stack_offset_first_element_in_bytes         The offset of the first element of the output
+ * @param[out] l1_stack_counter_ptr                           Pointer to the l1 stack counters of an image. Supported data types: U8
+ * @param[in]  l1_stack_counter_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  l1_stack_counter_step_x                        l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  l1_stack_counter_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  l1_stack_counter_step_y                        l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in]  low_thr                                        The lower threshold
+ * @param[in]  up_thr                                         The upper threshold
+ * @param[in]  width                                          The width of the image.
+ * @param[in]  height                                         The height of the image
+ */
+kernel void hysteresis(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(visited),
+    IMAGE_DECLARATION(recorded),
+    IMAGE_DECLARATION(l1_stack),
+    IMAGE_DECLARATION(l1_stack_counter),
+    uint low_thr,
+    uint up_thr,
+    int  width,
+    int  height)
+{
+    // Create images
+    Image src              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+    Image out              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
+    Image visited          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
+    Image recorded         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
+    Image l1_stack         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
+    Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
+
+    // Index
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // Load value
+    DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
+
+    // If the pixel has already been marked as NO_EDGE, store that value in the output and return
+    if(val == NO_EDGE)
+    {
+        *offset(&out, x, y) = NO_EDGE;
+        return;
+    }
+
+    // Return if it is a MAYBE pixel. Such pixels will become edges if near a strong edge
+    if(val <= up_thr)
+    {
+        return;
+    }
+
+    // Init local stack 2
+    short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
+    int    L2_counter                          = 0;
+
+    // Perform recursive hysteresis
+    while(true)
+    {
+        // Get L1 stack pointer
+        __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
+
+        // If the pixel has already been visited, proceed with the items in the stack instead
+        if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
+        {
+            goto pop_stack;
+        }
+
+        // Set strong edge
+        *offset(&out, x, y) = EDGE;
+
+        // If it is the top of stack l2, we don't need check the surrounding pixels
+        if(L2_counter > (hysteresis_local_stack_L2 - 1))
+        {
+            goto pop_stack2;
+        }
+
+        // Points to the start of the local stack;
+        char c;
+
+        VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+        x_tmp;
+        uint4 v_tmp;
+
+        // Get direction pixel indices
+        int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
+
+        // Check 8 pixels around for weak edges where low_thr < val <= up_thr
+        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
+        v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
+        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
+        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
+        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
+
+        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
+        v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
+        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
+        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
+
+        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
+        v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
+        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
+        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
+        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
+
+#undef check_pixel
+
+pop_stack:
+        c = *((__global char *)offset(&l1_stack_counter, x, y));
+
+        if(c >= 1)
+        {
+            *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
+            int2 l_c = convert_int2(l1_ptr[c - 1]);
+
+            // Push the current position into level 2 stack
+            stack_L2[L2_counter].x = x;
+            stack_L2[L2_counter].y = y;
+
+            x = l_c.x;
+            y = l_c.y;
+
+            L2_counter++;
+
+            continue;
+        }
+
+        if(L2_counter > 0)
+        {
+            goto pop_stack2;
+        }
+        else
+        {
+            return;
+        }
+
+pop_stack2:
+        L2_counter--;
+        x = stack_L2[L2_counter].x;
+        y = stack_L2[L2_counter].y;
+    };
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/channel_combine.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/channel_combine.clembed
new file mode 100644
index 0000000..b8cda7f
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/channel_combine.clembed

@@ -0,0 +1,959 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function combines three planes to a single RGB image.
+ *
+ * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: RGB
+ * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGB888(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data0 = vload16(0, plane0.ptr);
+    uchar16 data1 = vload16(0, plane1.ptr);
+    uchar16 data2 = vload16(0, plane2.ptr);
+
+    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
+                             data0.s1, data1.s1, data2.s1,
+                             data0.s2, data1.s2, data2.s2,
+                             data0.s3, data1.s3, data2.s3,
+                             data0.s4, data1.s4, data2.s4,
+                             data0.s5);
+    vstore16(out0, 0, dst.ptr);
+
+    uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
+                             data1.s6, data2.s6, data0.s7,
+                             data1.s7, data2.s7, data0.s8,
+                             data1.s8, data2.s8, data0.s9,
+                             data1.s9, data2.s9, data0.sA,
+                             data1.sA);
+    vstore16(out1, 0, dst.ptr + 16);
+
+    uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
+                             data2.sB, data0.sC, data1.sC,
+                             data2.sC, data0.sD, data1.sD,
+                             data2.sD, data0.sE, data1.sE,
+                             data2.sE, data0.sF, data1.sF,
+                             data2.sF);
+    vstore16(out2, 0, dst.ptr + 32);
+}
+
+/** This function combines three planes to a single RGBA image.
+ *
+ * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] plane3_ptr                           Pointer to the fourth plane. Supported Format: U8
+ * @param[in] plane3_stride_x                      Stride of the fourth plane in X dimension (in bytes)
+ * @param[in] plane3_step_x                        plane3_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane3_stride_y                      Stride of the fourth plane in Y dimension (in bytes)
+ * @param[in] plane3_step_y                        plane3_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
+ * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: RGBA
+ * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGBA8888(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(plane3),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3);
+    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data0 = vload16(0, plane0.ptr);
+    uchar16 data1 = vload16(0, plane1.ptr);
+    uchar16 data2 = vload16(0, plane2.ptr);
+    uchar16 data3 = vload16(0, plane3.ptr);
+
+    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0,
+                             data0.s1, data1.s1, data2.s1, data3.s1,
+                             data0.s2, data1.s2, data2.s2, data3.s2,
+                             data0.s3, data1.s3, data2.s3, data3.s3);
+    vstore16(out0, 0, dst.ptr);
+
+    uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4,
+                             data0.s5, data1.s5, data2.s5, data3.s5,
+                             data0.s6, data1.s6, data2.s6, data3.s6,
+                             data0.s7, data1.s7, data2.s7, data3.s7);
+    vstore16(out1, 0, dst.ptr + 16);
+
+    uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8,
+                             data0.s9, data1.s9, data2.s9, data3.s9,
+                             data0.sA, data1.sA, data2.sA, data3.sA,
+                             data0.sB, data1.sB, data2.sB, data3.sB);
+    vstore16(out2, 0, dst.ptr + 32);
+
+    uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC,
+                             data0.sD, data1.sD, data2.sD, data3.sD,
+                             data0.sE, data1.sE, data2.sE, data3.sE,
+                             data0.sF, data1.sF, data2.sF, data3.sF);
+    vstore16(out3, 0, dst.ptr + 48);
+}
+
+/** This function combines three planes to a single YUYV image.
+ *
+ * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: YUYV
+ * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ */
+__kernel void channel_combine_YUYV422(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data0 = vload16(0, plane0.ptr);
+    uchar8  data1 = vload8(0, plane1.ptr);
+    uchar8  data2 = vload8(0, plane2.ptr);
+
+    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0,
+                             data0.s2, data1.s1, data0.s3, data2.s1,
+                             data0.s4, data1.s2, data0.s5, data2.s2,
+                             data0.s6, data1.s3, data0.s7, data2.s3);
+    vstore16(out0, 0, dst.ptr);
+    uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4,
+                             data0.sA, data1.s5, data0.sB, data2.s5,
+                             data0.sC, data1.s6, data0.sD, data2.s6,
+                             data0.sE, data1.s7, data0.sF, data2.s7);
+    vstore16(out1, 0, dst.ptr + 16);
+}
+
+/** This function combines three planes to a single UYUV image.
+ *
+ * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: UYUV
+ * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ */
+__kernel void channel_combine_UYVY422(
+    IMAGE_DECLARATION(plane0),
+    IMAGE_DECLARATION(plane1),
+    IMAGE_DECLARATION(plane2),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data0 = vload16(0, plane0.ptr);
+    uchar8  data1 = vload8(0, plane1.ptr);
+    uchar8  data2 = vload8(0, plane2.ptr);
+
+    uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1,
+                             data1.s1, data0.s2, data2.s1, data0.s3,
+                             data1.s2, data0.s4, data2.s2, data0.s5,
+                             data1.s3, data0.s6, data2.s3, data0.s7);
+    vstore16(out0, 0, dst.ptr);
+    uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9,
+                             data1.s5, data0.sA, data2.s5, data0.sB,
+                             data1.s6, data0.sC, data2.s6, data0.sD,
+                             data1.s7, data0.sE, data2.s7, data0.sF);
+    vstore16(out1, 0, dst.ptr + 16);
+}
+
+/** This function combines three planes to a single NV12/NV21 image.
+ *
+ * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine.
+ *
+ * @param[in] src_plane0_ptr                           Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x                        src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y                        src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr                           Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x                        src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y                        src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr                           Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x                        src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y                        src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr                           Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x                      Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x                        dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y                      Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y                        dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr                           Pointer to the second plane of the destination image. Supported Format: UV88
+ * @param[in] dst_plane1_stride_x                      Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x                        dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y                      Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y                        dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] height                                   Sub-sampled height
+ */
+__kernel void channel_combine_NV(
+    IMAGE_DECLARATION(src_plane0),
+    IMAGE_DECLARATION(src_plane1),
+    IMAGE_DECLARATION(src_plane2),
+    IMAGE_DECLARATION(dst_plane0),
+    IMAGE_DECLARATION(dst_plane1),
+    uint height)
+{
+    // Get pixels pointer
+    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+
+    // Copy plane data
+    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+
+    // Create UV place
+    uchar8 data1 = vload8(0, src_plane1.ptr);
+    uchar8 data2 = vload8(0, src_plane2.ptr);
+
+#ifdef NV12
+    vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#elif defined(NV21)
+    vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#endif /* NV12 or NV21 */
+}
+
+/** This function combines three planes to a single YUV444 or IYUV image.
+ *
+ * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine.
+ *
+ * @param[in] src_plane0_ptr                           Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x                        src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y                        src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr                           Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x                        src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y                        src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr                           Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x                        src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y                        src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr                           Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x                      Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x                        dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y                      Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y                        dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr                           Pointer to the second plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane1_stride_x                      Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x                        dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y                      Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y                        dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] dst_plane2_ptr                           Pointer to the third plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane2_stride_x                      Stride of the third plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane2_step_x                        dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane2_stride_y                      Stride of the third plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane2_step_y                        dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
+ * @param[in] height                                   Sub-sampled height
+ */
+__kernel void copy_planes_3p(
+    IMAGE_DECLARATION(src_plane0),
+    IMAGE_DECLARATION(src_plane1),
+    IMAGE_DECLARATION(src_plane2),
+    IMAGE_DECLARATION(dst_plane0),
+    IMAGE_DECLARATION(dst_plane1),
+    IMAGE_DECLARATION(dst_plane2),
+    uint height)
+{
+    // Get pixels pointer
+    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+    Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
+
+    // Copy plane data
+    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+#ifdef YUV444
+    vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
+    vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#elif defined(IYUV)
+    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+    vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
+    vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#endif /* YUV444 or IYUV */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/channel_extract.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/channel_extract.clembed
new file mode 100644
index 0000000..7774edb
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/channel_extract.clembed

@@ -0,0 +1,815 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function extracts a given channel from an RGB image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: RGB
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGB888(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data  = vload16(0, src.ptr);
+    uchar8  data2 = vload8(0, src.ptr + 16);
+
+#ifdef CHANNEL_R
+    vstore4(data.s0369, 0, dst.ptr);
+    vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
+#elif defined(CHANNEL_G)
+    vstore4(data.s147A, 0, dst.ptr);
+    vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
+#elif defined(CHANNEL_B)
+    vstore4(data.s258B, 0, dst.ptr);
+    vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
+#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */
+}
+
+/** This function extracts a given channel from an RGBA image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: RGBA
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGBA8888(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data  = vload16(0, src.ptr);
+    uchar16 data2 = vload16(0, src.ptr + 16);
+
+#ifdef CHANNEL_R
+    vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr);
+#elif defined(CHANNEL_G)
+    vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr);
+#elif defined(CHANNEL_B)
+    vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr);
+#elif defined(CHANNEL_A)
+    vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr);
+#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */
+}
+
+/** This function extracts a given channel from an YUYV image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: YUYV
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_YUYV422(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data = vload16(0, src.ptr);
+
+#ifdef CHANNEL_Y
+    vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined(CHANNEL_U)
+    vstore4(data.s159D, 0, dst.ptr);
+#elif defined(CHANNEL_V)
+    vstore4(data.s37BF, 0, dst.ptr);
+#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
+}
+
+/** This function extracts a given channel from an UYUV image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: UYUV
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_UYVY422(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data = vload16(0, src.ptr);
+
+#ifdef CHANNEL_Y
+    vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined(CHANNEL_U)
+    vstore4(data.s048C, 0, dst.ptr);
+#elif defined(CHANNEL_V)
+    vstore4(data.s26AE, 0, dst.ptr);
+#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
+}
+
+/** This function extracts a given channel from an NV12 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only channels UV can be extracted using this kernel.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV12 (UV88)
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV12(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data = vload16(0, src.ptr);
+
+#ifdef CHANNEL_U
+    vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined(CHANNEL_V)
+    vstore8(data.s13579BDF, 0, dst.ptr);
+#endif /* CHANNEL_U or CHANNEL_V */
+}
+
+/** This function extracts a given channel from an NV21 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only channels UV can be extracted using this kernel.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV21 (UV88)
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV21(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 data = vload16(0, src.ptr);
+
+#ifdef CHANNEL_U
+    vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined(CHANNEL_V)
+    vstore8(data.s02468ACE, 0, dst.ptr);
+#endif /* CHANNEL_U or CHANNEL_V */
+}
+
+/** This function extracts a given plane from an multi-planar image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported Format: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void copy_plane(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Copy plane data
+    vstore8(vload8(0, src.ptr), 0, dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/channel_shuffle.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/channel_shuffle.clembed
new file mode 100644
index 0000000..484e5a1
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/channel_shuffle.clembed

@@ -0,0 +1,724 @@
+R"(
+
+/*
+* Copyright (c) 2018 ARM Limited.
+*
+* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to
+* deal in the Software without restriction, including without limitation the
+* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+* sell copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res)                \
+    ({                                                      \
+        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+        uint r  = div_res * (y);                            \
+        mod_res = (x)-r;                                    \
+    })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    uint curr_channel = 0; // channel id of input
+    uint batch_id     = 0; // batch id
+    uint group_id     = 0; // group id
+    uint channel_id   = 0; // channel id within the group
+
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+    const uint x = get_global_id(0) * VEC_SIZE;
+    const uint y = get_global_id(1) * 2;
+    const uint z = channel_id * NUM_GROUPS + group_id;
+
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+    TYPE u0                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    TYPE u1                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+    // Store blocks
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+    VSTORE(VEC_SIZE)
+    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note This implementation is only defined for VEC_SIZE = 4
+ * @note This last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64 in order to prevent out-of-bound writes.
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
+    uint       channel_id0  = 0;
+    uint       channel_id1  = 0;
+    uint       channel_id2  = 0;
+    uint       channel_id3  = 0;
+    uint       group_id0    = 0;
+    uint       group_id1    = 0;
+    uint       group_id2    = 0;
+    uint       group_id3    = 0;
+    uint       y            = 0;
+    uint       batch_id     = 0;
+
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
+
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
+    DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
+    DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
+    DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
+
+    const uint x  = get_global_id(1) * 2;
+    const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
+    const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
+    const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
+    const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
+
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
+    TYPE u0                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    TYPE u1                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+    // Store blocks
+    __global uchar *output_ptr                                                              = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
+}
+#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/col2im.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/col2im.clembed
new file mode 100644
index 0000000..4b59425
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/col2im.clembed

@@ -0,0 +1,654 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+/** This kernel performs a reshaping of the output of the convolution layer
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
+ * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
+ * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ * @note The number of groups must be passed at compile time using  -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void col2im(
+    TENSOR3D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0);
+
+    const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
+    const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    uint  x         = get_global_id(0) * 8;
+    uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+    cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8));
+
+    // Clamp x if out-of-bounds
+    x_clamped = select((uint8)x, x_clamped, convert_int8(cond0));
+
+    // If out-of-bound, overwrite with the first element
+    data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
+
+#if NUM_GROUPS > 1
+    // Compute output offset (batches on 4th dimension)
+    int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
+
+    const uint group = get_global_id(2) % NUM_GROUPS; // group ID
+    x_clamped += group * WIDTH_INPUT;
+#else  /* defined(NUM_GROUPS > 1 ) */
+    // Compute output offset (batches on 3rd dimension)
+    int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
+#endif /* NUM_GROUPS > 1 */
+
+    // Store value
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
+}
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/color_convert.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/color_convert.clembed
new file mode 100644
index 0000000..7ff83e1
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/color_convert.clembed

@@ -0,0 +1,2454 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert an RGB888 image to RGBX8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 16 pixels every time
+    uchar16 rgb_0 = vload16(0, in.ptr);
+    uchar16 rgb_1 = vload16(0, in.ptr + 16);
+    uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+    uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
+    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+    uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
+    uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
+
+    vstore16(rgba_0, 0, out.ptr);
+    vstore16(rgba_1, 0, out.ptr + 16);
+    vstore16(rgba_2, 0, out.ptr + 32);
+    vstore16(rgba_3, 0, out.ptr + 48);
+}
+
+/** Convert an RGB888 image to U8
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: RGB888
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_U8_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 16 pixels every time
+    const uchar16 rgb_0 = vload16(0, in.ptr);
+    const uchar16 rgb_1 = vload16(0, in.ptr + 16);
+    const uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+    //Resequence values from a sequence of 16 RGB values to sequence of 16 R, 16 G, 16 B values
+    const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd);
+    const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se);
+    const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf);
+
+    const float16 rgb2u8_red_coef_bt709   = 0.2126f;
+    const float16 rgb2u8_green_coef_bt709 = 0.7152f;
+    const float16 rgb2u8_blue_coef_bt709  = 0.0722f;
+
+    //Computation of 16 greyscale values in float
+    const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b);
+
+    //Convert it to 16 grayscale uchar values
+    const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0);
+
+    vstore16(greyscale_u8_0, 0, out.ptr);
+}
+
+/** Convert an RGB888 image to RGBX8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGBA8888_to_RGB888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+    // handle 16 pixels every time
+    uchar16 rgba_0 = vload16(0, in.ptr);
+    uchar16 rgba_1 = vload16(0, in.ptr + 16);
+    uchar16 rgba_2 = vload16(0, in.ptr + 32);
+    uchar16 rgba_3 = vload16(0, in.ptr + 48);
+
+    uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124);
+    uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);
+    uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde);
+
+    vstore16(rgb_0, 0, out.ptr);
+    vstore16(rgb_1, 0, out.ptr + 16);
+    vstore16(rgb_2, 0, out.ptr + 32);
+}
+
+/** Convert a UYVY422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGB888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 uyvy = vload16(0, in.ptr);
+
+    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+    float8 red_coef_bt709    = (float8)(1.5748f);
+    float8 green_coef_bt709  = (float8)(-0.1873f);
+    float8 green_coef2_bt709 = (float8)(-0.4681f);
+    float8 blue_coef_bt709   = (float8)(1.8556f);
+    float8 lumav             = convert_float8(luma);
+
+    float8 f_r = red_coef_bt709 * convert_float8(cr);
+    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
+    float8 f_b = blue_coef_bt709 * convert_float8(cb);
+
+    f_r += lumav;
+    f_g += lumav;
+    f_b += lumav;
+
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+    vstore16(rgb_0, 0, out.ptr);
+    vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a UYVY422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 uyvy = vload16(0, in.ptr);
+
+    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+    float8 red_coef_bt709    = (float8)(1.5748f);
+    float8 green_coef_bt709  = (float8)(-0.1873f);
+    float8 green_coef2_bt709 = (float8)(-0.4681f);
+    float8 blue_coef_bt709   = (float8)(1.8556f);
+    float8 lumav             = convert_float8(luma);
+
+    float8 f_r = red_coef_bt709 * convert_float8(cr);
+    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
+    float8 f_b = blue_coef_bt709 * convert_float8(cb);
+
+    f_r += lumav;
+    f_g += lumav;
+    f_b += lumav;
+
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+    vstore16(rgba_0, 0, out.ptr);
+    vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGB888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 uyvy = vload16(0, in.ptr);
+
+    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
+    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
+    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
+
+    float8 red_coef_bt709    = (float8)(1.5748f);
+    float8 green_coef_bt709  = (float8)(-0.1873f);
+    float8 green_coef2_bt709 = (float8)(-0.4681f);
+    float8 blue_coef_bt709   = (float8)(1.8556f);
+    float8 lumav             = convert_float8(luma);
+
+    float8 f_r = red_coef_bt709 * convert_float8(cr);
+    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
+    float8 f_b = blue_coef_bt709 * convert_float8(cb);
+
+    f_r += lumav;
+    f_g += lumav;
+    f_b += lumav;
+
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+    vstore16(rgb_0, 0, out.ptr);
+    vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 8 pixels every time
+    uchar16 uyvy = vload16(0, in.ptr);
+
+    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
+    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
+    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
+
+    float8 red_coef_bt709    = (float8)(1.5748f);
+    float8 green_coef_bt709  = (float8)(-0.1873f);
+    float8 green_coef2_bt709 = (float8)(-0.4681f);
+    float8 blue_coef_bt709   = (float8)(1.8556f);
+    float8 lumav             = convert_float8(luma);
+
+    float8 f_r = red_coef_bt709 * convert_float8(cr);
+    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
+    float8 f_b = blue_coef_bt709 * convert_float8(cb);
+
+    f_r += lumav;
+    f_g += lumav;
+    f_b += lumav;
+
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
+
+    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+    vstore16(rgba_0, 0, out.ptr);
+    vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a RGB image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                           Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr                            Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_stride_x                       Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_step_x                         luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_stride_y                       Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_step_y                         luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_offset_first_element_in_bytes  The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr                              Pointer to the destination uv channel. Supported Format: U8
+ * @param[in]  uv_stride_x                         Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in]  uv_step_x                           uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_stride_y                         Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  uv_step_y                           uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_offset_first_element_in_bytes    The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGB888_to_NV12_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(luma),
+    IMAGE_DECLARATION(uv))
+{
+    Image in     = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // handle 4 pixels every time, two lines, each line for 2 pixels
+    // Read 2 pixel of the first line
+    uchar8 rgb_0 = vload8(0, in.ptr);
+    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s3);
+    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s4);
+    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+    short2 i_y = convert_short2_rtz(f_y);
+    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_0, 0, out_y.ptr);
+
+    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixel of the second line
+    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s3);
+    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s4);
+    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+    i_y = convert_short2_rtz(f_y);
+    i_u = convert_short2_rtz(f_u) + (short2)(128);
+    i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
+
+    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+
+    vstore2(cbcr, 0, out_uv.ptr);
+}
+
+/*
+    R'= Y' + 0.0000*U + 1.5748*V
+    G'= Y' - 0.1873*U - 0.4681*V
+    B'= Y' + 1.8556*U + 0.0000*V
+*/
+
+/** Convert an NV12 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  rgb_output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGB888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // handle 8 pixels every time, two lines, each line for 4 pixels
+    uchar4 luma_0 = vload4(0, in_luma.ptr);
+    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+    uchar4 cbcr   = vload4(0, in_uv.ptr);
+    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+    char4  cr     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert a RGB image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
+ * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  rgb_input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_YUV444_bt709(
+    IMAGE_DECLARATION(rgb_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // handle 4 pixels every time
+    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 4 pixel
+    uchar16 rgb_0 = vload16(0, in_rgb.ptr);
+    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9);
+    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa);
+    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb);
+
+    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+    short4 i_y = convert_short4_rtz(f_y);
+    short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
+    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
+
+    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
+    vstore4(luma_0, 0, out_y.ptr);
+
+    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
+    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
+    vstore4(cb_0, 0, out_u.ptr);
+    vstore4(cr_0, 0, out_v.ptr);
+}
+
+/** Convert a RGB image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
+ * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  rgb_input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_IYUV_bt709(
+    IMAGE_DECLARATION(rgb_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // handle 4 pixels every time, two lines, each line for 2 pixels
+    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 2 pixel of the first line
+    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s3);
+    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s4);
+    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+    short2 i_y = convert_short2_rtz(f_y);
+    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_0, 0, out_y.ptr);
+
+    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixel of the second line
+    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
+    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s3);
+    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s4);
+    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+    i_y = convert_short2_rtz(f_y);
+    i_u = convert_short2_rtz(f_u) + (short2)(128);
+    i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+    *out_u.ptr = cbcr.x;
+    *out_v.ptr = cbcr.y;
+}
+
+/** Convert a RGBA image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  rgba_input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  rgba_input_step_y                         rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_YUV444_bt709(
+    IMAGE_DECLARATION(rgba_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // handle 4 pixels every time
+    Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 4 pixel
+    uchar16 rgb_0 = vload16(0, in_rgba.ptr);
+    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc);
+    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd);
+    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se);
+
+    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+    short4 i_y = convert_short4(f_y);
+    short4 i_u = convert_short4(f_u) + (short4)(128);
+    short4 i_v = convert_short4(f_v) + (short4)(128);
+
+    uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255))));
+    vstore4(luma_0, 0, out_y.ptr);
+
+    uchar4 cb_0 = convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255))));
+    uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255))));
+    vstore4(cb_0, 0, out_u.ptr);
+    vstore4(cr_0, 0, out_v.ptr);
+}
+
+/** Convert a RGBA image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                                 Pointer to the source image. Supported Format: U8
+ * @param[in]  input_stride_x                            Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_output_ptr                             Pointer to the destination uv channel. Supported Format: U8
+ * @param[in]  uv_output_stride_x                        Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_output_stride_y                        Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGBA8888_to_NV12_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image in     = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // Read 2 pixel of the first line
+    uchar8 rgb_0 = vload8(0, in.ptr);
+    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s4);
+    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s5);
+    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s6);
+
+    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+    short2 i_y = convert_short2_rtz(f_y);
+    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_0, 0, out_y.ptr);
+
+    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixel of the second line
+    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s4);
+    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s5);
+    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s6);
+
+    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+    i_y = convert_short2_rtz(f_y);
+    i_u = convert_short2_rtz(f_u) + (short2)(128);
+    i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+    vstore2(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a RGBA image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_IYUV_bt709(
+    IMAGE_DECLARATION(rgba_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    // handle 4 pixels every time, two lines, each line for 2 pixels
+    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 2 pixel of the first line
+    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s4);
+    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s5);
+    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s6);
+
+    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+    short2 i_y = convert_short2_rtz(f_y);
+    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_0, 0, out_y.ptr);
+
+    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixel of the second line
+    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y);
+    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s4);
+    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s5);
+    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s6);
+
+    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+    i_y = convert_short2_rtz(f_y);
+    i_u = convert_short2_rtz(f_u) + (short2)(128);
+    i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+    *out_u.ptr = cbcr.x;
+    *out_v.ptr = cbcr.y;
+}
+
+/** Convert an NV12 image to RGB8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    uchar4 luma_0 = vload4(0, in_luma.ptr);
+    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+    uchar4 cbcr   = vload4(0, in_uv.ptr);
+    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+    char4  cr     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+    vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV12 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_IYUV_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // handle 32 pixels every time, two lines, each line for 16 pixels
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar16 cbcr   = vload16(0, in_uv.ptr);
+    uchar8  cb     = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
+    uchar8  cr     = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
+
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore8(cb, 0, out_u.ptr);
+    vstore8(cr, 0, out_v.ptr);
+}
+
+/** Convert an NV12 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_YUV444_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // handle 32 pixels every time, two lines, each line for 16 pixels
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar16 cbcr   = vload16(0, in_uv.ptr);
+    uchar16 cb     = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
+                               cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
+    uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
+                           cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
+
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore16(cb, 0, out_u.ptr);
+    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+    vstore16(cr, 0, out_v.ptr);
+    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  rgb_output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGB888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // handle 8 pixels every time, two lines, each line for 4 pixels
+    uchar4 luma_0 = vload4(0, in_y.ptr);
+    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+    uchar4 cbcr   = vload4(0, in_uv.ptr);
+    char4  cr     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+    char4  cb     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV12 image to RGB8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(rgba_output))
+{
+    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+    // handle 8 pixels every time, two lines, each line for 4 pixels
+    uchar4 luma_0 = vload4(0, in_luma.ptr);
+    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+    uchar4 cbcr   = vload4(0, in_uv.ptr);
+    char4  cr     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+    char4  cb     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
+    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_YUV444_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // handle 32 pixels every time, two lines, each line for 16 pixels
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar16 cbcr   = vload16(0, in_uv.ptr);
+    uchar16 cr     = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
+                               cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
+    uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
+                           cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
+
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore16(cb, 0, out_u.ptr);
+    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+    vstore16(cr, 0, out_v.ptr);
+    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
+ * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_IYUV_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(uv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar16 cbcr   = vload16(0, in_uv.ptr);
+    uchar8  cr     = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
+    uchar8  cb     = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
+
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore8(cb, 0, out_u.ptr);
+    vstore8(cr, 0, out_v.ptr);
+}
+
+/** Convert a UYVY image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  uyvy_input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  uyvy_input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  uyvy_input_step_x                         uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uyvy_input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  uyvy_input_step_y                         uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uyvy_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void UYVY422_to_IYUV_bt709(
+    IMAGE_DECLARATION(uyvy_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // handle 16 pixels every time, each line 8 pixels
+    uchar16 uyvy = vload16(0, in_uyvy.ptr);
+    uchar8  luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+    ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
+    ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
+    vstore8(luma, 0, out_y.ptr);
+
+    uyvy         = vload16(0, in_uyvy.ptr + uyvy_input_stride_y);
+    luma         = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+    ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
+    ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
+    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
+    uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
+    vstore4(cb, 0, out_u.ptr);
+    vstore4(cr, 0, out_v.ptr);
+}
+
+/** Convert a YUYV image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void YUYV422_to_IYUV_bt709(
+    IMAGE_DECLARATION(yuyv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // handle 16 pixels every time, each line 8 pixels
+    uchar16 yuyv = vload16(0, in_yuyv.ptr);
+    uchar8  luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
+    ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
+    vstore8(luma, 0, out_y.ptr);
+
+    yuyv         = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+    luma         = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
+    ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
+    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
+    uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
+    vstore4(cb, 0, out_u.ptr);
+    vstore4(cr, 0, out_v.ptr);
+}
+
+/** Convert an IYUV image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  u_input_ptr                              Pointer to the source U channel. Supported Format: U8
+ * @param[in]  u_input_stride_x                         Stride of the source image U channel in X dimension (in bytes)
+ * @param[in]  u_input_step_x                           u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_input_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  u_input_step_y                           u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_input_offset_first_element_in_bytes    The offset of the first element in the source U channel
+ * @param[in]  v_input_ptr                              Pointer to the source V channel. Supported Format: U8
+ * @param[in]  v_input_stride_x                         Stride of the source image V channel in X dimension (in bytes)
+ * @param[in]  v_input_step_x                           v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_input_stride_y                         Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in]  v_input_step_y                           v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_input_offset_first_element_in_bytes    The offset of the first element in the source image V channel
+ * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGB888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(rgb_output))
+{
+    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // handle 8 pixels every time, two lines, each line for 4 pixels
+    uchar4 luma_0 = vload4(0, in_y.ptr);
+    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
+    char4  cr     = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an IYUV image to RGB8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
+ * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
+ * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
+ * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
+ * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
+ * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
+ * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGBA8888_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(rgba_output))
+{
+    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+    // handle 8 pixels every time, two lines, each line for 4 pixels
+    uchar4 luma_0 = vload4(0, in_y.ptr);
+    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
+    char4  cr     = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
+
+    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+    float4 f_r = convert_float4(luma_0) + temp0;
+    float4 f_g = convert_float4(luma_0) + temp1;
+    float4 f_b = convert_float4(luma_0) + temp2;
+
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
+
+    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    vstore8(rgb_0, 0, out_rgb.ptr);
+    vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+    f_r = convert_float4(luma_1) + temp0;
+    f_g = convert_float4(luma_1) + temp1;
+    f_b = convert_float4(luma_1) + temp2;
+
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
+
+    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
+    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an IYUV image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
+ * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
+ * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
+ * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
+ * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
+ * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
+ * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
+ * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
+ * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
+ * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
+ *
+ */
+__kernel void IYUV_to_YUV444_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(u_output),
+    IMAGE_DECLARATION(v_output))
+{
+    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u  = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v  = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // handle 32 pixels every time, two lines, each line for 16 pixels
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar8  cb_src = vload8(0, in_u.ptr);
+    uchar8  cr_src = vload8(0, in_v.ptr);
+    uchar16 cb     = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
+                               cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
+    uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
+                           cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
+
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore16(cb, 0, out_u.ptr);
+    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+    vstore16(cr, 0, out_v.ptr);
+    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an IYUV image to NV12
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
+ * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
+ * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
+ * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
+ * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
+ * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
+ * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
+ * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
+ * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_output_stride_y                        Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void IYUV_to_NV12_bt709(
+    IMAGE_DECLARATION(luma_input),
+    IMAGE_DECLARATION(u_input),
+    IMAGE_DECLARATION(v_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image in_y   = CONVERT_TO_IMAGE_STRUCT(luma_input);
+    Image in_u   = CONVERT_TO_IMAGE_STRUCT(u_input);
+    Image in_v   = CONVERT_TO_IMAGE_STRUCT(v_input);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // handle 32 pixels every time, two lines, each line for 16 pixels
+    uchar16 luma_0 = vload16(0, in_y.ptr);
+    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+    uchar8  cb     = vload8(0, in_u.ptr);
+    uchar8  cr     = vload8(0, in_v.ptr);
+    uchar16 cbcr   = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
+                               cr.s6, cb.s7, cr.s7);
+
+    vstore16(luma_0, 0, out_y.ptr);
+    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+    vstore16(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a YUYV image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
+ * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
+ * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_output_stride_y                        Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void YUYV422_to_NV12_bt709(
+    IMAGE_DECLARATION(yuyv_input),
+    IMAGE_DECLARATION(luma_output),
+    IMAGE_DECLARATION(uv_output))
+{
+    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
+    Image out_uv  = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+    // handle 16 pixels every time, each line 8 pixels
+    uchar16 yuyv   = vload16(0, in_yuyv.ptr);
+    ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+    uchar8  luma   = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    vstore8(luma, 0, out_y.ptr);
+
+    yuyv           = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+    ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+    luma           = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+    uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
+    vstore8(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert a UYVY image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in]  input_uyvy_ptr                           Pointer to the source image. Supported Format: U8
+ * @param[in]  input_uyvy_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_uyvy_step_x                        input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_uyvy_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_uyvy_step_y                        input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr                                 Pointer to the destination luma channel. Supported Format: U8
+ * @param[in]  luma_stride_x                            Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in]  luma_step_x                              luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  luma_stride_y                            Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  luma_step_y                              luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  luma_offset_first_element_in_bytes       The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr                                   Pointer to the destination uv channel. Supported Format: U8
+ * @param[in]  uv_stride_x                              Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in]  uv_step_x                                uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  uv_stride_y                              Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in]  uv_step_y                                uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  uv_offset_first_element_in_bytes         The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void UYVY422_to_NV12_bt709(
+    IMAGE_DECLARATION(input_uyvy),
+    IMAGE_DECLARATION(luma),
+    IMAGE_DECLARATION(uv))
+{
+    Image in     = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
+    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
+    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // handle 16 pixels every time, each line 8 pixels
+    const uchar16 uyvy_t = vload16(0, in.ptr);
+    vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
+
+    const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
+    vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
+
+    const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
+    const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
+    const uchar8  cbcr   = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
+    vstore8(cbcr, 0, out_uv.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/comparisons.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/comparisons.clembed
new file mode 100644
index 0000000..8ecc9d3
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/comparisons.clembed

@@ -0,0 +1,693 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define EQUAL(x, y) ((x) == (y))
+#define NOTEQUAL(x, y) ((x) != (y))
+#define GREATER(x, y) ((x) > (y))
+#define GREATEREQUAL(x, y) ((x) >= (y))
+#define LESS(x, y) ((x) < (y))
+#define LESSEQUAL(x, y) ((x) <= (y))
+
+#define DEFINE_KERNEL_STR(name) compare_##name
+#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
+
+#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized
+#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name)
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
+/** This function compares two tensors.
+ *
+ * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: All non-quantized data types.
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void DEFINE_KERNEL(OP_NAME)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2)
+/** This function compares two quantized tensors.
+ *
+ * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: All quantized data types.
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16);
+    int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16);
+
+    in_a = in_a - (int16)((int)OFFSET_IN1);
+    in_b = in_b - (int16)((int)OFFSET_IN2);
+
+    const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
+    const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
+    const int16   res    = OP(in1f32, in2f32);
+
+    // Store result
+    vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/concatenate.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/concatenate.clembed
new file mode 100644
index 0000000..b969df3
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/concatenate.clembed

@@ -0,0 +1,970 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale)
+{
+    const VEC_FLOAT in_f32  = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
+    const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
+    const VEC_QUANT res_q8  = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT);
+    return res_q8;
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+#if defined(DEPTH) && defined(ELEMENT_SIZE)
+
+#if defined(INPUT1_WIDTH)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+#if VEC_SIZE == 2
+#define SEQ ((int2)(0, 1))
+#elif VEC_SIZE == 4
+#define SEQ ((int4)(0, 1, 2, 3))
+#elif VEC_SIZE == 8
+#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7))
+#elif VEC_SIZE == 16
+#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
+#else // VEC_SIZE
+#error "Vector size not supported"
+#endif // VEC_SIZE
+
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  src1_pad_right                     Right paddings of the first input tensor in unit of elements
+ * @param[in]  src1_pad_left                      Left paddings of the second input tensor in unit of elements
+ */
+__kernel void concatenate_width_x2(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(dst),
+    uint src1_pad_right,
+    uint src2_pad_left)
+{
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    // Calculate input indices
+    const int x  = get_global_id(0) * (int)VEC_SIZE;
+    const int y  = get_global_id(1);
+    const int z  = get_global_id(2) % (int)DEPTH;
+    const int w  = get_global_id(2) / (int)DEPTH;
+    const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
+    const int x2 = max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left);
+
+    // Calculate inputs and output addresses
+    const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+    src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1)  && defined(SCALE_IN2) && defined(SCALE_OUT) */
+    const VEC_DATA_TYPE(int, VEC_SIZE) x_coords        = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values    = select(src2_values, src1_values, cond);
+
+    VSTORE(VEC_SIZE)
+    (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
+ * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src3_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src3_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src3_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src3_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src3_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src3_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src3_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src3_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src3_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src4_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src4_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src4_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src4_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src4_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src4_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src4_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src4_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src4_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  src1_pad_right                     Right paddings of the first input tensor in unit of elements
+ * @param[in]  src2_pad_left                      Left paddings of the second input tensor in unit of elements
+ * @param[in]  src2_pad_right                     Right paddings of the second input tensor in unit of elements
+ * @param[in]  src3_pad_left                      Left paddings of the third input tensor in unit of elements
+ * @param[in]  src3_pad_right                     Right paddings of the third input tensor in unit of elements
+ * @param[in]  src4_pad_left                      Left paddings of the fourth input tensor in unit of elements
+ */
+__kernel void concatenate_width_x4(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(src3),
+    TENSOR4D_DECLARATION(src4),
+    TENSOR4D_DECLARATION(dst),
+    uint src1_pad_right,
+    uint src2_pad_left,
+    uint src2_pad_right,
+    uint src3_pad_left,
+    uint src3_pad_right,
+    uint src4_pad_left)
+{
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    // Calculate input indices
+    const int x = get_global_id(0) * (int)VEC_SIZE;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % (int)DEPTH;
+    const int w = get_global_id(2) / (int)DEPTH;
+
+    const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
+    const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left), (int)INPUT2_WIDTH + (int)src2_pad_right - (int)VEC_SIZE);
+    const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)src3_pad_left), (int)INPUT3_WIDTH + (int)src3_pad_right - (int)VEC_SIZE);
+    const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)src4_pad_left);
+
+    // Calculate inputs and output addresses
+    const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+    const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+    const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
+    src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+    src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
+    src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
+
+    const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values = select(src2_values, src1_values, cond_in2);
+    values = select(src3_values, values, cond_in3);
+    values = select(src4_values, values, cond_in4);
+
+    VSTORE(VEC_SIZE)
+    (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
+#endif /* defined(INPUT1_WIDTH) */
+#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
+
+#if defined(WIDTH_OFFSET) && defined(DEPTH)
+/** This kernel concatenates the input tensor into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void concatenate_width(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    VSTORE(VEC_SIZE)
+    (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
+#else  /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+    VSTORE(VEC_SIZE)
+    (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+
+#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
+
+#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
+/** This kernel concatenates the input tensor into the output tensor along the second dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Vector sizes supported are 2,4,8 and 16.
+ * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void concatenate_height(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    VSTORE(VEC_SIZE)
+    (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+#else  /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+    VSTORE(VEC_SIZE)
+    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+
+#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) */
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  offsets                           The offsets to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    int offset)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    VSTORE(VEC_SIZE)
+    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offset));
+}
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convert_fc_weights.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convert_fc_weights.clembed
new file mode 100644
index 0000000..6a1b9a0
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convert_fc_weights.clembed

@@ -0,0 +1,601 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
+/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights.
+ *
+ * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor.
+ * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All.
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convert_fc_weights(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y;
+
+    *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convolution3x3.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution3x3.clembed
new file mode 100644
index 0000000..0dbe5e2
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution3x3.clembed

@@ -0,0 +1,680 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
+                                                  const short left_coeff,
+                                                  const short middle_coeff,
+                                                  const short right_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src   A pointer to source Image structure
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2,
+    const short mat3, const short mat4, const short mat5,
+    const short mat6, const short mat7, const short mat8, uint scale)
+{
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    // Row 0
+    pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
+    // Row
+    pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
+    // Row 2
+    pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
+
+    // Divide by the scale
+    return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
+ * e.g. -DMAT0=1 -DMAT2=2, ...-DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution3x3_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution3x3(&src,
+                            MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convolution5x5.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution5x5.clembed
new file mode 100644
index 0000000..2aab613
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution5x5.clembed

@@ -0,0 +1,830 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x5(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
+           + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution5x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff)
+{
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    return out;
+}
+
+/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [  mat0,  mat1,  mat2,  mat3 , mat4 ]\n
+ * [  mat5,  mat6,  mat7,  mat8,  mat9 ]\n
+ * [ mat10, mat11, mat12, mat13, mat14 ]\n
+ * [ mat15, mat16, mat17, mat18, mat19 ]\n
+ * [ mat20, mat21, mat22, mat23, mat24 ]
+ *
+ * @param[in] src   A pointer to source Image structure.
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution5x5(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    uint scale)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
+    pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
+    pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
+    pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
+    pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
+
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32).
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x5_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
+
+    // Store result in dst
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable5x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
+
+    // Divide by the scale
+    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Store result in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution5x5_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short8 pixels = convolution5x5(&src,
+                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convolution7x7.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution7x7.clembed
new file mode 100644
index 0000000..632328e
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution7x7.clembed

@@ -0,0 +1,881 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the second left pixel
+ * @param[in] left3_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x7(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           left3_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff,
+    const short           right3_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
+            8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the second up pixel
+ * @param[in] up3_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the second down pixel
+ * @param[in] down3_coeff  Weight of the third down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution7x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short up3_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff,
+    const short down3_coeff)
+{
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+    return out;
+}
+
+/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6 ]\n
+ * [  mat7,  mat8,  mat9,  mat10, mat11, mat12, mat13 ]\n
+ * [  mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
+ * [  mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
+ * [  mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
+ * [  mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
+ * [  mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
+ *
+ * @param[in] src   A pointer to source Image structure.
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution7x7(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+    const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
+    pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
+    pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
+    pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
+    pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
+    pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
+    pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
+
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x7_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
+
+    // Store result in dst
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT24=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable7x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
+
+    // Divide by the scale
+    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Store result in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution7x7_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short8 pixels = convolution7x7(&src,
+                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
+
+    // Clamp results to [ 0, 255 ] and store them in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convolution9x9.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution9x9.clembed
new file mode 100644
index 0000000..b606aae
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution9x9.clembed

@@ -0,0 +1,946 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the second left pixel
+ * @param[in] left3_coeff  Weight of the third left pixel
+ * @param[in] left4_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the third right pixel
+ * @param[in] right4_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x9(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           left3_coeff,
+    const short           left4_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff,
+    const short           right3_coeff,
+    const short           right4_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
+            8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
+                    8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the second up pixel
+ * @param[in] up3_coeff    Weight of the third up pixel
+ * @param[in] up4_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the second down pixel
+ * @param[in] down3_coeff  Weight of the third down pixel
+ * @param[in] down4_coeff  Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution9x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short up3_coeff,
+    const short up4_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff,
+    const short down3_coeff,
+    const short down4_coeff)
+{
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
+
+    return out;
+}
+
+/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6,  mat7, mat8 ]\n
+ * [  mat9,  mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
+ * [  mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
+ * [  mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
+ * [  mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
+ * [  mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
+ * [  mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]
+ * [  mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]
+ * [  mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
+ *
+ * @param[in] src   A pointer to source Image structure.
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] mat49 Coefficient from the convolution matrix
+ * @param[in] mat50 Coefficient from the convolution matrix
+ * @param[in] mat51 Coefficient from the convolution matrix
+ * @param[in] mat52 Coefficient from the convolution matrix
+ * @param[in] mat53 Coefficient from the convolution matrix
+ * @param[in] mat54 Coefficient from the convolution matrix
+ * @param[in] mat55 Coefficient from the convolution matrix
+ * @param[in] mat56 Coefficient from the convolution matrix
+ * @param[in] mat57 Coefficient from the convolution matrix
+ * @param[in] mat58 Coefficient from the convolution matrix
+ * @param[in] mat59 Coefficient from the convolution matrix
+ * @param[in] mat60 Coefficient from the convolution matrix
+ * @param[in] mat61 Coefficient from the convolution matrix
+ * @param[in] mat62 Coefficient from the convolution matrix
+ * @param[in] mat63 Coefficient from the convolution matrix
+ * @param[in] mat64 Coefficient from the convolution matrix
+ * @param[in] mat65 Coefficient from the convolution matrix
+ * @param[in] mat66 Coefficient from the convolution matrix
+ * @param[in] mat67 Coefficient from the convolution matrix
+ * @param[in] mat68 Coefficient from the convolution matrix
+ * @param[in] mat69 Coefficient from the convolution matrix
+ * @param[in] mat70 Coefficient from the convolution matrix
+ * @param[in] mat71 Coefficient from the convolution matrix
+ * @param[in] mat72 Coefficient from the convolution matrix
+ * @param[in] mat73 Coefficient from the convolution matrix
+ * @param[in] mat74 Coefficient from the convolution matrix
+ * @param[in] mat75 Coefficient from the convolution matrix
+ * @param[in] mat76 Coefficient from the convolution matrix
+ * @param[in] mat77 Coefficient from the convolution matrix
+ * @param[in] mat78 Coefficient from the convolution matrix
+ * @param[in] mat79 Coefficient from the convolution matrix
+ * @param[in] mat80 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution9x9(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+    const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
+    const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
+    const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
+    const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
+    const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
+    const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
+    const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
+    const short mat80, uint scale)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
+    pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
+    pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
+    pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
+    pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
+    pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
+    pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
+    pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
+    pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
+
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DCOMPUTE_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x9_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
+
+    // Store result in dst
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable9x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
+
+    // Divide by the scale
+    pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Store result in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution9x9_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short8 pixels = convolution9x9(&src,
+                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
+                                   MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
+                                   MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
+                                   MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convolution_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution_layer.clembed
new file mode 100644
index 0000000..3389368
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution_layer.clembed

@@ -0,0 +1,655 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(NUM_GROUPS)
+/** This kernel reshapes the tensor's low three dimensions to single column
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=number. e.g. -DNUM_GROUPS=2
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  bias_ptr                           Pointer to the bias tensor. Supported data types: F16/F32, for quantized types this must be nullptr
+ * @param[in]  bias_stride_x                      Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  bias_step_x                        bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  width                              The width of the input tensor
+ * @param[in]  height                             The height of the input tensor
+ * @param[in]  depth                              The depth of the input tensor
+ * @param[in]  total_filters                      Total number of filters. 4th dimension of the weights matrix
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ */
+__kernel void reshape_to_columns(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(bias),
+#endif /* HAS_BIAS */
+    uint width, uint height, uint depth, uint total_filters, uint dst_stride_z)
+{
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT(src);
+    bool     is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
+
+    __global uchar *tmp_src_ptr = src.ptr;
+    __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+                                      2) * width * height * dst_stride_y;
+#ifdef HAS_BIAS
+    __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif /* HAS_BIAS */
+
+    if(is_last_thread)
+    {
+        for(uint g = 0; g < NUM_GROUPS; ++g)
+        {
+            __global uchar *curr_group_dst = tmp_dst_ptr;
+
+            for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+            {
+                *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+
+#ifdef HAS_BIAS
+                *((__global DATA_TYPE *)(curr_group_dst + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+                tmp_bias_ptr += bias_stride_x;
+#endif /* HAS_BIAS */
+                tmp_src_ptr += depth * src_stride_z;
+                curr_group_dst += dst_stride_x;
+            }
+
+            tmp_dst_ptr += dst_stride_z;
+        }
+    }
+    else
+    {
+        for(uint g = 0; g < NUM_GROUPS; ++g)
+        {
+            __global uchar *curr_group_dst = tmp_dst_ptr;
+
+            for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+            {
+                *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+                tmp_src_ptr += depth * src_stride_z;
+                curr_group_dst += dst_stride_x;
+            }
+
+            tmp_dst_ptr += dst_stride_z;
+        }
+    }
+}
+#endif // defined(DATA_TYPE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/convolution_rectangle.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution_rectangle.clembed
new file mode 100644
index 0000000..e3e94a1
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/convolution_rectangle.clembed

@@ -0,0 +1,3978 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
+                                                  const short left_coeff,
+                                                  const short middle_coeff,
+                                                  const short right_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src   A pointer to source Image structure
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2,
+    const short mat3, const short mat4, const short mat5,
+    const short mat6, const short mat7, const short mat8, uint scale)
+{
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    // Row 0
+    pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
+    // Row
+    pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
+    // Row 2
+    pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
+
+    // Divide by the scale
+    return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
+ * e.g. -DMAT0=1 -DMAT2=2, ...-DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution3x3_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution3x3(&src,
+                            MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x5(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
+           + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution5x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff)
+{
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    return out;
+}
+
+/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [  mat0,  mat1,  mat2,  mat3 , mat4 ]\n
+ * [  mat5,  mat6,  mat7,  mat8,  mat9 ]\n
+ * [ mat10, mat11, mat12, mat13, mat14 ]\n
+ * [ mat15, mat16, mat17, mat18, mat19 ]\n
+ * [ mat20, mat21, mat22, mat23, mat24 ]
+ *
+ * @param[in] src   A pointer to source Image structure.
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution5x5(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    uint scale)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
+    pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
+    pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
+    pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
+    pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
+
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32).
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x5_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
+
+    // Store result in dst
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable5x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
+
+    // Divide by the scale
+    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Store result in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution5x5_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short8 pixels = convolution5x5(&src,
+                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the second left pixel
+ * @param[in] left3_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x7(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           left3_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff,
+    const short           right3_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
+            8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the second up pixel
+ * @param[in] up3_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the second down pixel
+ * @param[in] down3_coeff  Weight of the third down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution7x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short up3_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff,
+    const short down3_coeff)
+{
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+    return out;
+}
+
+/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6 ]\n
+ * [  mat7,  mat8,  mat9,  mat10, mat11, mat12, mat13 ]\n
+ * [  mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
+ * [  mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
+ * [  mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
+ * [  mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
+ * [  mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
+ *
+ * @param[in] src   A pointer to source Image structure.
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution7x7(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+    const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
+    pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
+    pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
+    pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
+    pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
+    pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
+    pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
+
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x7_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
+
+    // Store result in dst
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT24=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable7x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
+
+    // Divide by the scale
+    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Store result in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders.
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution7x7_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short8 pixels = convolution7x7(&src,
+                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
+
+    // Clamp results to [ 0, 255 ] and store them in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] left_pixel   Pointer to the left pixel
+ * @param[in] left1_coeff  Weight of the most left pixel
+ * @param[in] left2_coeff  Weight of the second left pixel
+ * @param[in] left3_coeff  Weight of the third left pixel
+ * @param[in] left4_coeff  Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the third right pixel
+ * @param[in] right4_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x9(
+    __global const uchar *left_pixel,
+    const short           left1_coeff,
+    const short           left2_coeff,
+    const short           left3_coeff,
+    const short           left4_coeff,
+    const short           middle_coeff,
+    const short           right1_coeff,
+    const short           right2_coeff,
+    const short           right3_coeff,
+    const short           right4_coeff)
+{
+    uchar16 temp = vload16(0, left_pixel);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
+            8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
+                    8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the second up pixel
+ * @param[in] up3_coeff    Weight of the third up pixel
+ * @param[in] up4_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the second down pixel
+ * @param[in] down3_coeff  Weight of the third down pixel
+ * @param[in] down4_coeff  Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution9x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short up3_coeff,
+    const short up4_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff,
+    const short down3_coeff,
+    const short down4_coeff)
+{
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    val;
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
+
+    return out;
+}
+
+/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6,  mat7, mat8 ]\n
+ * [  mat9,  mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
+ * [  mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
+ * [  mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
+ * [  mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
+ * [  mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
+ * [  mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]
+ * [  mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]
+ * [  mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
+ *
+ * @param[in] src   A pointer to source Image structure.
+ * @param[in] mat0  Coefficient from the convolution matrix
+ * @param[in] mat1  Coefficient from the convolution matrix
+ * @param[in] mat2  Coefficient from the convolution matrix
+ * @param[in] mat3  Coefficient from the convolution matrix
+ * @param[in] mat4  Coefficient from the convolution matrix
+ * @param[in] mat5  Coefficient from the convolution matrix
+ * @param[in] mat6  Coefficient from the convolution matrix
+ * @param[in] mat7  Coefficient from the convolution matrix
+ * @param[in] mat8  Coefficient from the convolution matrix
+ * @param[in] mat9  Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] mat49 Coefficient from the convolution matrix
+ * @param[in] mat50 Coefficient from the convolution matrix
+ * @param[in] mat51 Coefficient from the convolution matrix
+ * @param[in] mat52 Coefficient from the convolution matrix
+ * @param[in] mat53 Coefficient from the convolution matrix
+ * @param[in] mat54 Coefficient from the convolution matrix
+ * @param[in] mat55 Coefficient from the convolution matrix
+ * @param[in] mat56 Coefficient from the convolution matrix
+ * @param[in] mat57 Coefficient from the convolution matrix
+ * @param[in] mat58 Coefficient from the convolution matrix
+ * @param[in] mat59 Coefficient from the convolution matrix
+ * @param[in] mat60 Coefficient from the convolution matrix
+ * @param[in] mat61 Coefficient from the convolution matrix
+ * @param[in] mat62 Coefficient from the convolution matrix
+ * @param[in] mat63 Coefficient from the convolution matrix
+ * @param[in] mat64 Coefficient from the convolution matrix
+ * @param[in] mat65 Coefficient from the convolution matrix
+ * @param[in] mat66 Coefficient from the convolution matrix
+ * @param[in] mat67 Coefficient from the convolution matrix
+ * @param[in] mat68 Coefficient from the convolution matrix
+ * @param[in] mat69 Coefficient from the convolution matrix
+ * @param[in] mat70 Coefficient from the convolution matrix
+ * @param[in] mat71 Coefficient from the convolution matrix
+ * @param[in] mat72 Coefficient from the convolution matrix
+ * @param[in] mat73 Coefficient from the convolution matrix
+ * @param[in] mat74 Coefficient from the convolution matrix
+ * @param[in] mat75 Coefficient from the convolution matrix
+ * @param[in] mat76 Coefficient from the convolution matrix
+ * @param[in] mat77 Coefficient from the convolution matrix
+ * @param[in] mat78 Coefficient from the convolution matrix
+ * @param[in] mat79 Coefficient from the convolution matrix
+ * @param[in] mat80 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution9x9(
+    Image      *src,
+    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+    const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
+    const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
+    const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
+    const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
+    const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
+    const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
+    const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
+    const short mat80, uint scale)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels;
+
+    pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
+    pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
+    pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
+    pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
+    pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
+    pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
+    pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
+    pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
+    pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
+
+    if(scale > 0)
+    {
+        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+    }
+
+    return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DCOMPUTE_TYPE=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x9_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
+
+    // Store result in dst
+    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable9x1_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Output pixels
+    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+    pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
+
+    // Divide by the scale
+    pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+    // Store result in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution9x9_static(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short8 pixels = convolution9x9(&src,
+                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
+                                   MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
+                                   MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
+                                   MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define MAT_INDEX(i) MAT##i
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif /* DATA_TYPE */
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif /* COMPUTE_TYPE */
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif /* DATA_TYPE_OUT */
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_rectangle(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    short matrix_coeff[81] =
+    {
+        MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
+        MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
+        MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
+        MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
+        MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
+        MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
+        MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
+        MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
+        MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
+    };
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
+
+    for(int i = 0; i < MATRIX_HEIGHT; i++)
+    {
+#if MATRIX_WIDTH == 3
+        pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
+                                 matrix_coeff[2 + i * 3]);
+#endif /* MATRIX_WIDTH */
+
+#if MATRIX_WIDTH == 5
+        pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
+                                 matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
+#endif /* MATRIX_WIDTH */
+
+#if MATRIX_WIDTH == 7
+        pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
+                                 matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
+                                 matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
+#endif /* MATRIX_WIDTH */
+
+#if MATRIX_WIDTH == 9
+        pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
+                                 matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
+                                 matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
+#endif /* MATRIX_WIDTH */
+    }
+
+    pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
+
+    // Store the result as is in dst
+    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
+}
+
+#endif /* not DYNAMIC_MATRIX_CONVOLUTION */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/copy_tensor.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/copy_tensor.clembed
new file mode 100644
index 0000000..5f7a6a4
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/copy_tensor.clembed

@@ -0,0 +1,671 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(PAD00) && defined(PAD10) && defined(PAD20) && defined(PAD21) && defined(PAD30) && defined(DATA_TYPE) && defined(VEC_SIZE) // Compile time constants
+
+/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DPAD{d}{0,1} = padding before{0} and after{1} dimension d (d < 4)
+ * -# -DDEPTH = The third dimension (depth) of the tensor (it is needed only if d == 3)
+ * -# -DDATA_TYPE = Input and output datatypes.
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_pad_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    const int offset_x = PAD00;
+    const int offset_y = PAD10;
+    const int offset_z = PAD20;
+
+#if PAD30 > 0
+    const size_t in_batch    = get_global_id(2) / DEPTH;
+    const int    total_depth = DEPTH + PAD20 + PAD21;
+    const int    offset_w    = PAD30 * total_depth + in_batch * (PAD20 + PAD21);
+#else  // PAD30 == 0
+    const int offset_w = 0;
+#endif // PAD30
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)tensor3D_offset(&out, offset_x, offset_y, offset_z + offset_w));
+}
+#endif // Compile time constants
+
+#if defined(DATA_TYPE)
+/** Performs a copy of input tensor to the output tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does then shift access vector to access elements within bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)out.ptr);
+#else  // defined(VEC_SIZE)
+    *((__global DATA_TYPE *)(out.ptr)) = *((__global DATA_TYPE *)(in.ptr));
+#endif // defined(VEC_SIZE)
+}
+#endif // defined(DATA_TYPE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/crop_tensor.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/crop_tensor.clembed
new file mode 100644
index 0000000..2c08fb3
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/crop_tensor.clembed

@@ -0,0 +1,639 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) // Compile time constants
+
+/** Performs a copy of input tensor to the output tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U16/S16/F16/U32/S32/F32
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  in_offset_y                       The initial offset of the input address along Y.
+ * @param[in]  in_offset_z                       The initial offset of the input address along Z.
+ */
+__kernel void crop_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    int in_offset_y,
+    int in_offset_z)
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    const int in_x = get_global_id(0) * (in_step_x / in_stride_x);
+
+#if defined(WIDTH_FLIPPED)
+    const int in_y = in_offset_y - get_global_id(1);
+#else  // defined(WIDTH_FLIPPED)
+    const int in_y                 = in_offset_y + get_global_id(1);
+#endif // defined(WIDTH_FLIPPED)
+
+#if defined(HEIGHT_FLIPPED)
+    const int in_z = in_offset_z - get_global_id(2);
+#else  // defined(HEIGHT_FLIPPED)
+    const int in_z                 = in_offset_z + get_global_id(2);
+#endif // defined(HEIGHT_FLIPPED)
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does then shift access vector to access elements within bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    __global const uchar *input_addr = tensor3D_offset(&in, in_x, in_y, in_z);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)), 0, (__global float *)out.ptr);
+#else  // defined(VEC_SIZE)
+    *((__global float *)(out.ptr)) = CONVERT(*((__global DATA_TYPE *)tensor3D_offset(&in, in_x, in_y, in_z)), float);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // defined(DATA_TYPE) && defined(LAST_ACCESSED_X)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/deconvolution_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/deconvolution_layer.clembed
new file mode 100644
index 0000000..def9fa9
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/deconvolution_layer.clembed

@@ -0,0 +1,673 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function applies upsample on an input image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All.
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void deconvolution_upsample(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Store result
+    *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+
+#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The decovnolution output tensor
+ * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
+ * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
+ * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
+ * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
+ * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=width, e.g., -DSRC_HEIGHT=10
+ * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
+ *
+ * @param[in]  src_ptr                            Pointer to the source image. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @param[in]  bias_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32
+ * @param[in]  bias_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+__kernel void deconvolution_reshape(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(ADD_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(ADD_BIAS)
+)
+{
+#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT))
+
+    Tensor3D        src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D        dst  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+    const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr;
+
+    // Store result
+    const int x_in = get_global_id(0);
+    const int y_in = get_global_id(1);
+    const int z_in = get_global_id(2);
+
+#if defined(NUM_FILTERS)
+    const int bias_index = x_in / (FILTER_AREA);
+    const int z_out      = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT));
+    const int x_out      = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+    const int y_out      = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+#else  // defined(NUM_FILTERS)
+    const int x_out      = x_in / (FILTER_AREA);
+    const int y_out      = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+    const int z_out      = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+    const int bias_index = x_out;
+#endif // defined(NUM_FILTERS)
+
+#if defined(ADD_BIAS)
+    Vector          bias     = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+    const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val;
+#else  // defined(ADD_BIAS)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data;
+#endif // defined(ADD_BIAS)
+
+#undef FILTER_AREA
+}
+#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/depth_convert.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/depth_convert.clembed
new file mode 100644
index 0000000..586008c
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/depth_convert.clembed

@@ -0,0 +1,678 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifdef SATURATE
+#if defined(IS_DATA_TYPE_FLOAT)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#else /* defined(IS_DATA_TYPE_FLOAT) */
+#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
+#endif /* defined(IS_DATA_TYPE_FLOAT) */
+#else  /* SATURATE */
+#define CONVERT_DOWN(x, type) CONVERT(x, type)
+#endif /* SATURATE */
+
+#define CONVERT_UP(x, type) CONVERT(x, type)
+
+/** This function performs a down-scaling depth conversion.
+ *
+ * @attention For QSYMM8_PER_CHANNEL -> QASYMM8, it is user's responsibility to keep track of the quantization info.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  shift                             The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_down(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    const int shift)
+{
+    // Get pixels pointer
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+#if defined(IS_DATA_TYPE_QUANTIZED)
+    in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80;
+#endif // defined(IS_DATA_TYPE_QUANTIZED)
+
+#if defined(IS_DATA_TYPE_FLOAT)
+    VSTORE(VEC_SIZE)
+    (CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#else  /* defined(IS_DATA_TYPE_FLOAT) */
+    VSTORE(VEC_SIZE)
+    (CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#endif /* defined(IS_DATA_TYPE_FLOAT) */
+}
+
+/** This function performs a up-scaling depth conversion.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  shift                             The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_up(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    const int shift)
+{
+    // Get pixels pointer
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+#if defined(IS_DATA_TYPE_FLOAT)
+    VSTORE(VEC_SIZE)
+    (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#else  /* defined(IS_DATA_TYPE_FLOAT) */
+    VSTORE(VEC_SIZE)
+    (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+#endif /* defined(IS_DATA_TYPE_FLOAT) */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/depth_to_space.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/depth_to_space.clembed
new file mode 100644
index 0000000..bddb25e
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/depth_to_space.clembed

@@ -0,0 +1,654 @@
+R"(
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % r;
+
+    const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+    const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0) % r;
+
+    const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+    const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/depthwise_convolution.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/depthwise_convolution.clembed
new file mode 100644
index 0000000..6897e32
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/depthwise_convolution.clembed

@@ -0,0 +1,3001 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+/** Get the pointer position at a certain offset in x and y direction.
+ *
+ * @param[in] ptr      Pointer to the starting position of the buffer
+ * @param[in] x        Relative X position
+ * @param[in] y        Relative Y position
+ * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
+ *
+ * @return a uchar
+ */
+inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
+{
+    return ptr + x * stride_x + y * stride_y;
+}
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
+    ({                                                             \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
+        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
+        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
+        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
+    ({                                                             \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
+        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
+        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
+        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
+        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
+        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
+        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
+        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
+        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
+        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
+    })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
+    ({                                                                   \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
+        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
+        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
+        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
+    ({                                                                   \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
+        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
+        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
+        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
+        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
+        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
+        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
+        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
+        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
+        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
+    })
+
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1);                                 \
+    })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1);                                 \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1);                                 \
+        acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2);                                  \
+        acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2);                                   \
+        acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2);                                 \
+        acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3);                                  \
+        acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3);                                   \
+        acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3);                                 \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1);                                 \
+        acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2);                                  \
+        acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2);                                   \
+        acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2);                                 \
+        acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3);                                  \
+        acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3);                                   \
+        acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3);                                 \
+    })
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
+#if defined(CONV_STRIDE_X)
+
+#if CONV_STRIDE_X == 1
+#define convolution1x3 convolution1x3_stride_1
+#elif CONV_STRIDE_X == 2
+#define convolution1x3 convolution1x3_stride_2
+#elif CONV_STRIDE_X == 3
+#define convolution1x3 convolution1x3_stride_3
+#else /* CONV_STRIDE_X */
+#error "Stride not supported"
+#endif /* CONV_STRIDE_X */
+
+/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a float2 containing 2 convoluted values.
+ */
+inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
+                                      const float           left_coeff,
+                                      const float           middle_coeff,
+                                      const float           right_coeff)
+{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+    float4 temp = vload4(0, (__global float *)left_pixel);
+
+    float2 left   = CONVERT(temp.s01, float2);
+    float2 middle = CONVERT(temp.s12, float2);
+    float2 right  = CONVERT(temp.s23, float2);
+    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+    return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
+           + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
+           + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a float2 containing 2 convoluted values.
+ */
+inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
+                                      const float           left_coeff,
+                                      const float           middle_coeff,
+                                      const float           right_coeff)
+{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+    float4 temp0 = vload4(0, (__global float *)left_pixel);
+    float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
+
+    float2 left   = CONVERT(temp0.s02, float2);
+    float2 middle = CONVERT(temp0.s13, float2);
+    float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);
+
+    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+    __global float *left_pixel_float = (__global float *)left_pixel;
+
+    return vload4(0, left_pixel_float).s02 * (float2)left_coeff
+           + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
+           + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a float2 containing 2 convoluted values.
+ */
+inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
+                                      const float           left_coeff,
+                                      const float           middle_coeff,
+                                      const float           right_coeff)
+{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+    float4 temp0 = vload4(0, (__global float *)left_pixel);
+    float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
+
+    float2 left   = CONVERT(temp0.s03, float2);
+    float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
+    float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);
+
+    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+    __global float *left_pixel_float = (__global float *)left_pixel;
+
+    return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
+           + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
+           + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+}
+
+/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src  A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ *
+ * @return a float2 containing 2 convoluted values.
+ */
+inline float2 convolution3x3(
+    __global const uchar *src,
+    unsigned int          src_stride_y,
+    const float mat0, const float mat1, const float mat2,
+    const float mat3, const float mat4, const float mat5,
+    const float mat6, const float mat7, const float mat8)
+{
+    float2 pixels;
+
+    pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
+    pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
+    pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
+
+    return pixels;
+}
+
+/** This OpenCL kernel computes the depthwise convolution 3x3
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    float2 pixels = 0.0f;
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+
+    __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+
+    // Load the weights
+    float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    pixels = convolution3x3(src_addr, src_stride_y,
+                            weights_values0.s0, weights_values0.s1, weights_values0.s2,
+                            weights_values1.s0, weights_values1.s1, weights_values1.s2,
+                            weights_values2.s0, weights_values2.s1, weights_values2.s2);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = *((__global float *)(vector_offset(&biases, channel)));
+
+    pixels += (float2)bias;
+#endif //defined(HAS_BIAS)
+
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
+}
+#endif //defined(CONV_STRIDE_X)
+
+#if(DILATION_X > 1 || DILATION_Y > 1)
+
+/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] src_stride_x     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_stride_y     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
+ */
+inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                     const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    float2 pixels0 = 0.0f;
+
+    float2 src00_left  = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    float2 src00_mid   = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    float2 src10_left  = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    float2 src10_mid   = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    float2 src20_left  = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    float2 src20_mid   = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] src_stride_x     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_stride_y     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
+ */
+inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                     const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    float2 pixels0 = 0.0f;
+
+    float3 src00_left  = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    float3 src00_mid   = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    float3 src10_left  = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    float3 src10_mid   = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    float3 src20_left  = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    float3 src20_mid   = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
+ * stride_x and stride_y are equal to 1
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    float2 pixels0 = 0.0f;
+    float2 pixels1 = 0.0f;
+    float2 pixels2 = 0.0f;
+    float2 pixels3 = 0.0f;
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+    // Load the weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
+    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
+    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
+    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
+    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
+    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
+    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
+
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 1st row
+    pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 3rd row
+    pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = *((__global float *)(vector_offset(&biases, channel)));
+
+    pixels0 += (float2)bias;
+    pixels1 += (float2)bias;
+    pixels2 += (float2)bias;
+    pixels3 += (float2)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+}
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
+ * stride_x and stride_y are equal to 2
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    float2 pixels0 = 0.0f;
+    float2 pixels1 = 0.0f;
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+    // Load the weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
+    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
+    float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
+    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
+    float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
+    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
+    float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
+    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
+    float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
+    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
+    float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
+
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = *((__global float *)(vector_offset(&biases, channel)));
+
+    pixels0 += (float2)bias;
+    pixels1 += (float2)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
+/** Reshape the weights for quantized depthwise convolution
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
+ * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
+ * @attention Input's height and width should be 3
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_convolution_reshape_weights(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Vector    src = CONVERT_TO_VECTOR_STRUCT(src);
+    const int x   = get_global_id(0);
+
+    // Load 3x3xVEC_SIZE weights
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
+
+#if defined(TRANSPOSE)
+#if VEC_SIZE != 4
+#error "VEC_SIZE not supported"
+#else  // VEC_SIZE != 4
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
+#endif // VEC_SIZE != 4
+#else  // !defined(TRANSPOSE)
+    VSTORE(VEC_SIZE)
+    (w0, 0, dst_addr + 0);
+    VSTORE(VEC_SIZE)
+    (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
+#endif // defined(TRANSPOSE)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
+#if defined(CONV_STRIDE_X)
+#if CONV_STRIDE_X == 1
+#define convolution1x3_f16 convolution1x3_stride_1_f16
+#elif CONV_STRIDE_X == 2
+#define convolution1x3_f16 convolution1x3_stride_2_f16
+#elif CONV_STRIDE_X == 3
+#define convolution1x3_f16 convolution1x3_stride_3_f16
+#else /* CONV_STRIDE_X */
+#error "Stride not supported"
+#endif /* CONV_STRIDE_X */
+
+#if(DILATION_X > 1 || DILATION_Y > 1)
+
+/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] src_stride_x     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_stride_y     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
+ */
+inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                    const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    half4 pixels0 = 0.0f;
+
+    half4 src00_left  = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    half4 src00_mid   = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    half4 src10_left  = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    half4 src10_mid   = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    half4 src20_left  = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    half4 src20_mid   = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] src_stride_x     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_stride_y     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
+ */
+inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                    const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    half4 pixels0 = 0.0f;
+
+    half8 src00_left  = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    half8 src00_mid   = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    half8 src10_left  = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    half8 src10_mid   = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    half8 src20_left  = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    half8 src20_mid   = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+#endif // (DILATION_X > 1 && DILATION_Y > 1)
+
+/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
+                                         const half            left_coeff,
+                                         const half            middle_coeff,
+                                         const half            right_coeff)
+{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+    half8 temp = vload8(0, (__global half *)left_pixel);
+
+    half4 left   = CONVERT(temp.s0123, half4);
+    half4 middle = CONVERT(temp.s1234, half4);
+    half4 right  = CONVERT(temp.s2345, half4);
+
+    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+    return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
+           + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
+           + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
+                                         const half            left_coeff,
+                                         const half            middle_coeff,
+                                         const half            right_coeff)
+{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+    half8 temp0 = vload8(0, (__global half *)left_pixel);
+    half temp1  = *((__global half *)(left_pixel + 8 * sizeof(half)));
+
+    half4 left   = CONVERT(temp0.s0246, half4);
+    half4 middle = CONVERT(temp0.s1357, half4);
+    half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);
+
+    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    __global half *left_pixel_float = (__global half *)left_pixel;
+
+    return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
+           + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
+           + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
+                                         const half            left_coeff,
+                                         const half            middle_coeff,
+                                         const half            right_coeff)
+{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+    half16 temp0 = vload16(0, (__global half *)left_pixel);
+
+    half4 left   = CONVERT(temp0.s0369, half4);
+    half4 middle = CONVERT(temp0.s147A, half4);
+    half4 right  = CONVERT(temp0.s258B, half4);
+
+    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    __global half *left_pixel_float = (__global half *)left_pixel;
+
+    return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
+           + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
+           + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+}
+
+/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src  A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution3x3_f16(
+    Image     *src,
+    const half mat0, const half mat1, const half mat2,
+    const half mat3, const half mat4, const half mat5,
+    const half mat6, const half mat7, const half mat8)
+{
+    half4 pixels;
+
+    pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
+    pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
+    pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
+
+    return pixels;
+}
+
+#if defined(DEPTH_MULTIPLIER)
+
+/** This OpenCL kernel computes the depthwise convolution 3x3
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_f16(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif //defined(HAS_BIAS)
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+
+    uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
+    half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
+    half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
+    half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
+
+    half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+                                      weights_values1.s0, weights_values1.s1, weights_values1.s2,
+                                      weights_values2.s0, weights_values2.s1, weights_values2.s2);
+#if defined(HAS_BIAS)
+    pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
+#endif //defined(HAS_BIAS)
+
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
+}
+#endif // defined(DEPTH_MULTIPLIER)
+#endif // defined(CONV_STRIDE_X)
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
+ * when both stride_x and stride_y are equal to 1
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    half bias = *((__global half *)(vector_offset(&biases, channel)));
+#endif /* defined(HAS_BIAS) */
+
+    half4 pixels0 = 0.0f;
+    half4 pixels1 = 0.0f;
+    half4 pixels2 = 0.0f;
+    half4 pixels3 = 0.0f;
+
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
+    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+    half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
+
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 1st row
+    pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 3rd row
+    pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#ifdef HAS_BIAS
+    pixels0 += (half4)bias;
+    pixels1 += (half4)bias;
+    pixels2 += (half4)bias;
+    pixels3 += (half4)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
+}
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
+ * when both stride_x and stride_y are equal to 2
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    half bias = *((__global half *)(vector_offset(&biases, channel)));
+#endif /* defined(HAS_BIAS) */
+
+    half4 pixels0 = 0.0f;
+    half4 pixels1 = 0.0f;
+
+    // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
+    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+    half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+    half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+    half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+    half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+    half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
+
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels1                 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#ifdef HAS_BIAS
+    pixels0 += (half4)bias;
+    pixels1 += (half4)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
+
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
+/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
+ * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
+ * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
+ * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
+ * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
+ * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void dwc_MxN_native_fp_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif // defined(HAS_BIAS)
+)
+{
+    int x = get_global_id(0); // channels
+    int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int z                   = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
+
+    __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
+
+    __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
+
+#if defined(HAS_BIAS)
+    __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
+#endif // defined(HAS_BIAS)
+
+#if defined(DST_DEPTH)
+    s_addr += b * src_stride_w;
+    d_addr += b * dst_stride_w;
+#endif // defined(DST_DEPTH)
+
+    for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
+    {
+        // Each work-item computes N0x1x1 elements
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        res = 0;
+
+        int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
+        int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
+
+        for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+        {
+            if(y_coord >= 0 && y_coord < SRC_DIM2)
+            {
+                int x_coord_tmp = x_coord;
+
+                for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+                {
+                    if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
+                    {
+                        int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
+                        int w_offset = xk * weights_stride_y + yk * weights_stride_z;
+
+                        // Load input and weights values
+                        VEC_DATA_TYPE(DATA_TYPE, N0)
+                        i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
+                        VEC_DATA_TYPE(DATA_TYPE, N0)
+                        w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+                        res += i * w;
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+                        res = fma(i, w, res);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+                    }
+                    x_coord_tmp += DILATION_X;
+                }
+            }
+            y_coord += DILATION_Y;
+        }
+
+#if defined(HAS_BIAS)
+        res += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
+#endif // defined(HAS_BIAS)
+
+        res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL);
+
+        VSTORE(N0)
+        (res, 0, (__global DATA_TYPE *)(d_addr));
+
+        w_addr += sizeof(DATA_TYPE);
+        d_addr += sizeof(DATA_TYPE);
+#if defined(HAS_BIAS)
+        b_addr += sizeof(DATA_TYPE);
+#endif // defined(HAS_BIAS)
+    }
+}
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
+
+#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
+
+#if DATA_TYPE != float || DATA_TYPE != half
+#error "Unsupported data type"
+#endif // DATA_TYPE != float || DATA_TYPE != half
+
+#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+
+/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
+ * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] max_offset                            Max offset for the input tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    int max_offset)
+{
+    int x = get_global_id(0); // channels
+    int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
+
+    int  z_coord  = 0;
+    int4 offset   = 0;
+    int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
+
+    // We compute 2x1x1 [C,W,H] elements
+    VEC_FLOAT acc = 0;
+
+    // Load weights
+    VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
+
+    // Load input values
+    // z == 0
+    // Clamp z_coord as for z = 0, it can be negative
+    // z_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
+    z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    offset  = min(offset, (int4)max_offset);
+
+    VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+
+    // z == 1
+    // z_coord can be only negative for z = 0 so we do not need to clamp it
+    // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
+    z_coord           = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
+    offset            = y_offset + (int4)(z_coord * src_stride_z);
+    VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+
+    // z == 2
+    // Offset can be out-of-bound so we need to check if it is greater than max_offset
+    z_coord           = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+    offset            = y_offset + (int4)(z_coord * src_stride_z);
+    offset            = min(offset, (int4)max_offset);
+    VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+
+    acc = fma(values0, w0, acc);
+    acc = fma(values1, w1, acc);
+    acc = fma(values2, w2, acc);
+
+    acc = fma(values3, w3, acc);
+    acc = fma(values4, w4, acc);
+    acc = fma(values5, w5, acc);
+
+    acc = fma(values6, w6, acc);
+    acc = fma(values7, w7, acc);
+    acc = fma(values8, w8, acc);
+
+#if defined(HAS_BIAS)
+    Vector    biases      = CONVERT_TO_VECTOR_STRUCT(biases);
+    VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+    acc += bias_values;
+#endif // defined(HAS_BIAS)
+
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
+    VSTORE(VEC_SIZE)
+    (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr));
+}
+#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
+ * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ *
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] max_offset                            Max offset for the input tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_nhwc_stride1(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    int max_offset)
+{
+    int x = get_global_id(0); // channels
+    int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int             z        = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
+
+    int  z_coord  = 0;
+    int4 offset   = 0;
+    int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+
+    // We compute 2x2x2 [C,W,H] elements
+    VEC_FLOAT acc0 = 0;
+    VEC_FLOAT acc1 = 0;
+    VEC_FLOAT acc2 = 0;
+    VEC_FLOAT acc3 = 0;
+
+    // Load weights
+    VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
+
+    // Load input values
+    // z == 0
+    // Clamp z_coord as for z = 0, it can be negative
+    // z_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
+    z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    offset  = min(offset, (int4)max_offset);
+
+    VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 1
+    // z_coord can be only negative for z = 0 so we do not need to clamp it
+    // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
+    z_coord           = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
+    offset            = y_offset + (int4)(z_coord * src_stride_z);
+    VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 2
+    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
+    // However offset can be out-of-bound so we need to check if it is greater than max_offset
+    offset += (int4)src_stride_z;
+    offset             = min(offset, (int4)max_offset);
+    VEC_FLOAT values8  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values9  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 3
+    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
+    // However offset can be out-of-bound so we need to check if it is greater than max_offset
+    offset += (int4)src_stride_z;
+    offset             = min(offset, (int4)max_offset);
+    VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    acc0 = fma(values0, w0, acc0);
+    acc0 = fma(values1, w1, acc0);
+    acc0 = fma(values2, w2, acc0);
+    acc1 = fma(values1, w0, acc1);
+    acc1 = fma(values2, w1, acc1);
+    acc1 = fma(values3, w2, acc1);
+
+    acc0 = fma(values4, w3, acc0);
+    acc0 = fma(values5, w4, acc0);
+    acc0 = fma(values6, w5, acc0);
+    acc1 = fma(values5, w3, acc1);
+    acc1 = fma(values6, w4, acc1);
+    acc1 = fma(values7, w5, acc1);
+
+    acc0 = fma(values8, w6, acc0);
+    acc0 = fma(values9, w7, acc0);
+    acc0 = fma(values10, w8, acc0);
+    acc1 = fma(values9, w6, acc1);
+    acc1 = fma(values10, w7, acc1);
+    acc1 = fma(values11, w8, acc1);
+
+    acc2 = fma(values4, w0, acc2);
+    acc2 = fma(values5, w1, acc2);
+    acc2 = fma(values6, w2, acc2);
+    acc3 = fma(values5, w0, acc3);
+    acc3 = fma(values6, w1, acc3);
+    acc3 = fma(values7, w2, acc3);
+
+    acc2 = fma(values8, w3, acc2);
+    acc2 = fma(values9, w4, acc2);
+    acc2 = fma(values10, w5, acc2);
+    acc3 = fma(values9, w3, acc3);
+    acc3 = fma(values10, w4, acc3);
+    acc3 = fma(values11, w5, acc3);
+
+    acc2 = fma(values12, w6, acc2);
+    acc2 = fma(values13, w7, acc2);
+    acc2 = fma(values14, w8, acc2);
+    acc3 = fma(values13, w6, acc3);
+    acc3 = fma(values14, w7, acc3);
+    acc3 = fma(values15, w8, acc3);
+
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+
+    acc0 += bias_values;
+    acc1 += bias_values;
+    acc2 += bias_values;
+    acc3 += bias_values;
+#endif // defined(HAS_BIAS)
+
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
+    VSTORE(VEC_SIZE)
+    (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc0, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc1, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+
+#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
+    if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
+#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
+    {
+        VSTORE(VEC_SIZE)
+        (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc2, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
+        VSTORE(VEC_SIZE)
+        (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc3, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
+    }
+}
+
+#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/depthwise_convolution_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/depthwise_convolution_quantized.clembed
new file mode 100644
index 0000000..e59cc38
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/depthwise_convolution_quantized.clembed

@@ -0,0 +1,4218 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+#ifndef VEC_SIZE
+#if defined(N0)
+#define VEC_SIZE N0
+#else /* defined(N0) */
+#define VEC_SIZE 8
+#endif /* defined(N0) */
+#endif /* VEC_SIZE */
+
+#if defined(ACTIVATION_TYPE) && defined(CONST_0)
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(S1_VAL) && !defined(S2_VAL)
+#define S2_VAL S1_VAL
+#endif // defined(S1_VAL) && !defined(S2_VAL)
+#if defined(O1_VAL) && !defined(O2_VAL)
+#define O2_VAL O1_VAL
+#endif // defined(O1_VAL) && !defined(O2_VAL)
+
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+    return max((TYPE)CONST_0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+    return min((TYPE)A_VAL, max((TYPE)CONST_0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#if defined(S1_VAL) && defined(S2_VAL)
+#if defined(O1_VAL) && defined(O2_VAL)
+#define PERFORM_ACTIVATION_QUANT(act, data)                                                       \
+    ({                                                                                            \
+        data = ACTIVATION_OP(act, data);                                                          \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
+        \
+        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                           \
+    })
+#else // defined(O1_VAL) && defined(O2_VAL)
+#define PERFORM_ACTIVATION_QUANT(act, data)                             \
+    ({                                                                  \
+        data = ACTIVATION_OP(act, data);                                \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                  \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));          \
+        \
+        fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL));       \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
+    })
+#endif /* defined(O1_VAL) && defined(O2_VAL) */
+#else  /* defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QUANT(act, data) \
+    ({                                      \
+        data = ACTIVATION_OP(act, data);    \
+    })
+#endif /* defined(S1_VAL) && defined(S2_VAL) */
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+
+#if defined(FLOAT_DOMAIN)
+// Activations performed in the float domain
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+/** This performs an activation function on quantized inputs with float transformations.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant_f32(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VEC_FLOAT data_flt = CONVERT(data, VEC_FLOAT);
+#if defined(O1_VAL)
+    data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+#else  // defined(O1_VAL)
+    data_flt        = round(data_flt) * ((float)S1_VAL);
+#endif // defined(O1_VAL)
+    data_flt = ACTIVATION(ACT, float, data_flt, A_VAL, B_VAL);
+
+#if defined(O2_VAL)
+    data = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
+#else  // defined(O2_VAL)
+    data            = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
+#endif // defined(O2_VAL)
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+#else // defined(FLOAT_DOMAIN)
+// Activations performed in the quantized domain
+
+#if defined(ACT)
+/** This performs an activation function on quantized inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    data = PERFORM_ACTIVATION_QUANT(ACT, data);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(ACT)
+#endif // defined(FLOAT_DOMAIN)
+#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x)
+#else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE)
+
+#if defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
+
+#define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
+
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
+
+#if defined(WEIGHTS_PROMOTED_TYPE)
+#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+
+#if CONV_STRIDE_X > 3
+#error "Stride X not supported"
+#endif /* CONV_STRIDE_X > 3 */
+
+#if !defined(IS_DOT8)
+
+#if DILATION_X == 1
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right)                                                        \
+    ({                                                                                                      \
+        int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8);                         \
+        int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \
+        \
+        left   = CONVERT(temp0.s01234567, int8);                                                            \
+        middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);                                  \
+        right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);                                  \
+    })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right)                                                 \
+    ({                                                                                               \
+        int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16);               \
+        int temp1   = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \
+        \
+        left   = CONVERT(temp0.s02468ace, int8);                                                     \
+        middle = CONVERT(temp0.s13579bdf, int8);                                                     \
+        right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);                              \
+    })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right)                                                          \
+    ({                                                                                                        \
+        int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16);                        \
+        int8 temp1  = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
+        \
+        left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                                    \
+        middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);                                    \
+        right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);                                    \
+    })
+#endif /* CONV_STRIDE_X */
+
+#else /* DILATION_X == 1 */
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right)                                                                 \
+    ({                                                                                                               \
+        left   = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8);                                      \
+        middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8);     \
+        right  = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \
+    })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right)                                                                  \
+    ({                                                                                                                \
+        int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16);                                \
+        left        = CONVERT(temp0.s02468ace, int8);                                                                 \
+        \
+        temp0  = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16);    \
+        middle = CONVERT(temp0.s02468ace, int8);                                                                      \
+        \
+        temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
+        right = CONVERT(temp0.s02468ace, int8);                                                                       \
+    })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right)                                                                       \
+    ({                                                                                                                     \
+        int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16);                                     \
+        int8 temp1  = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8);              \
+        left        = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                                            \
+        \
+        temp0  = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16);         \
+        temp1  = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8);    \
+        middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                                                 \
+        \
+        temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16);      \
+        temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \
+        right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                                                  \
+    })
+
+#endif /* CONV_STRIDE_X */
+#endif /* DILATION_X==1 */
+
+/** This function computes the depthwise convolution quantized.
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                                   weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                                 Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                                   weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes            The offset of the first element in the weights tensor
+ * @param[in] output_multipliers_ptr                           Pointer to the output multipliers vector. Supported data types: S32
+ * @param[in] output_multipliers_stride_x                      Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] output_multipliers_step_x                        output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
+ * @param[in] output_shifts_ptr                                Pointer to the output shifts vector. Supported data types: S32
+ * @param[in] output_shifts_stride_x                           Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] output_shifts_step_x                             output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_shifts_offset_first_element_in_bytes      The offset of the first element in the output shifts vector
+ * @param[in] biases_ptr                                       (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                                  (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases vector
+ */
+
+__kernel void dwc_3x3_native_quantized8_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+    VECTOR_DECLARATION(output_multipliers),
+    VECTOR_DECLARATION(output_shifts)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src                = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst                = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights            = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Vector   output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
+    Vector   output_shifts      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    int bias_value = *((__global int *)(vector_offset(&biases, channel)));
+#endif //defined(HAS_BIAS)
+
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
+    w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
+    w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
+    w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
+    const int output_shift      = *((__global int *)vector_offset(&output_shifts, channel));
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+    int8 values0 = 0;
+    int8 sum0    = 0;
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    int8 values1 = 0;
+    int8 sum1    = 0;
+#endif /* CONV_STRIDE_Y &&DILATION_Y==1 */
+
+    // Row0
+    int8 left, middle, right;
+    GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
+    values0 += left * (int8)(w0.s0);
+    values0 += middle * (int8)(w0.s1);
+    values0 += right * (int8)(w0.s2);
+
+#if WEIGHTS_OFFSET != 0
+    sum0 += left + middle + right;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+    // Row1
+    GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
+    values0 += left * (int8)(w1.s0);
+    values0 += middle * (int8)(w1.s1);
+    values0 += right * (int8)(w1.s2);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += left * (int8)(w0.s0);
+    values1 += middle * (int8)(w0.s1);
+    values1 += right * (int8)(w0.s2);
+#endif /* CONV_STRIDE_Y && DILATION_Y== 1 */
+
+#if WEIGHTS_OFFSET != 0
+    int8 tmp = left + middle + right;
+    sum0 += tmp;
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    sum1 += tmp;
+#endif /* CONV_STRIDE_Y &&DILATION_Y== 1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+    // Row2
+    GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
+    values0 += left * (int8)(w2.s0);
+    values0 += middle * (int8)(w2.s1);
+    values0 += right * (int8)(w2.s2);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += left * (int8)(w1.s0);
+    values1 += middle * (int8)(w1.s1);
+    values1 += right * (int8)(w1.s2);
+#endif /* CONV_STRIDE_Y &&DILATION_Y == 1 */
+
+#if WEIGHTS_OFFSET != 0
+    tmp = left + middle + right;
+    sum0 += tmp;
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    sum1 += tmp;
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    // Row3
+    GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
+    values1 += left * (int8)(w2.s0);
+    values1 += middle * (int8)(w2.s1);
+    values1 += right * (int8)(w2.s2);
+
+#if WEIGHTS_OFFSET != 0
+    sum1 += left + middle + right;
+#endif /* WEIGHTS_OFFSET != 0 */
+#endif /* CONV_STRIDE_Y && DILATION_Y == 1 */
+
+#if defined(HAS_BIAS)
+    values0 += (int8)(bias_value);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += (int8)(bias_value);
+#endif /* CONV_STRIDE_Y & &DILATION_Y == 1 */
+#endif //defined(HAS_BIAS)
+
+#if WEIGHTS_OFFSET != 0
+    values0 += sum0 * (int8)(WEIGHTS_OFFSET);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += sum1 * (int8)(WEIGHTS_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    VEC_WEIGHTS_PROMOTED_TYPE(3)
+    tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
+
+    WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
+    values0 += sum_weights * (int8)(INPUT_OFFSET);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += sum_weights * (int8)(INPUT_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    values0 += (int8)(K_OFFSET);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += (int8)(K_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
+#endif /* K_OFFSET != 0 */
+
+#if defined(REAL_MULTIPLIER)
+
+    values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res0_shift_lt0                = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    int8 res0_shift_gt0                = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    values0                            = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#endif // defined(REAL_MULTIPLIER)
+
+    values0 += (int8)OUTPUT_OFFSET;
+    VEC_TYPE(8)
+    res0 = CONVERT_SAT(values0, VEC_TYPE(8));
+
+    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+#if defined(REAL_MULTIPLIER)
+
+    values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res1_shift_lt0      = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    int8 res1_shift_gt0      = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    values1                  = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#endif // defined(REAL_MULTIPLIER)
+
+    values1 += (int8)OUTPUT_OFFSET;
+    VEC_TYPE(8)
+    res1 = CONVERT_SAT(values1, VEC_TYPE(8));
+
+    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
+}
+
+#else // !defined(IS_DOT8)
+
+#if DILATION_X == 1
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right)                                    \
+    ({                                                                                  \
+        VEC_TYPE(8)                                                                     \
+        temp0 = vload8(0, (__global DATA_TYPE *)(first_value));                         \
+        VEC_TYPE(2)                                                                     \
+        temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \
+        \
+        left   = temp0.s01234567;                                                       \
+        middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0);                      \
+        right  = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01);                      \
+    })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right)                                       \
+    ({                                                                                     \
+        VEC_TYPE(16)                                                                       \
+        temp0           = vload16(0, (__global DATA_TYPE *)(first_value));                 \
+        DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
+        \
+        left   = temp0.s02468ace;                                                          \
+        middle = temp0.s13579bdf;                                                          \
+        right  = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1);                            \
+    })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right)                                     \
+    ({                                                                                   \
+        VEC_TYPE(16)                                                                     \
+        temp0 = vload16(0, (__global DATA_TYPE *)(first_value));                         \
+        VEC_TYPE(8)                                                                      \
+        temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
+        \
+        left   = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                       \
+        middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036);                       \
+        right  = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147);                       \
+    })
+#endif /* CONV_STRIDE_X */
+#else  /*DILATION_X==1*/
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right)                                                  \
+    ({                                                                                                \
+        left   = vload8(0, (__global DATA_TYPE *)(first_value));                                      \
+        middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE)));     \
+        right  = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
+    })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right)                                                   \
+    ({                                                                                                 \
+        VEC_TYPE(16)                                                                                   \
+        temp0  = vload16(0, (__global DATA_TYPE *)(first_value));                                      \
+        left   = temp0.s02468ace;                                                                      \
+        temp0  = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE)));     \
+        middle = temp0.s02468ace;                                                                      \
+        temp0  = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
+        right  = temp0.s02468ace;                                                                      \
+    })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right)                                                        \
+    ({                                                                                                      \
+        VEC_TYPE(16)                                                                                        \
+        temp0 = vload16(0, (__global DATA_TYPE *)(first_value));                                            \
+        VEC_TYPE(8)                                                                                         \
+        temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))));                   \
+        left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                                            \
+        \
+        temp0  = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE)));          \
+        temp1  = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE)));    \
+        middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                                          \
+        \
+        temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE)));       \
+        temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \
+        right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                                           \
+    })
+
+#endif /* CONV_STRIDE_X */
+#endif /*DILATION_X==1*/
+/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
+ *
+ * @note Per-channel quantization is not supported by this kernel.
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                                   weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                                 Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                                   weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes            The offset of the first element in the weights tensor
+ * @param[in] output_multipliers_ptr                           Pointer to the output multipliers vector. Supported data types: S32
+ * @param[in] output_multipliers_stride_x                      Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] output_multipliers_step_x                        output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
+ * @param[in] output_shifts_ptr                                Pointer to the output shifts vector. Supported data types: S32
+ * @param[in] output_shifts_stride_x                           Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] output_shifts_step_x                             output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_shifts_offset_first_element_in_bytes      The offset of the first element in the output shifts vector
+ * @param[in] biases_ptr                                       (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                                  (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases vector
+ */
+
+__kernel void dwc_3x3_native_quantized8_dot8_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+    VECTOR_DECLARATION(output_multipliers),
+    VECTOR_DECLARATION(output_shifts)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src                = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst                = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights            = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Vector   output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
+    Vector   output_shifts      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+
+#if defined(HAS_BIAS)
+    Vector    biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
+#endif //defined(HAS_BIAS)
+
+    // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+
+    VEC_TYPE(3)
+    w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
+    VEC_TYPE(3)
+    w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
+    VEC_TYPE(3)
+    w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
+
+    const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
+    const int output_shift      = *((__global int *)vector_offset(&output_shifts, 0));
+
+    VEC_TYPE(8)
+    left0, middle0, right0;
+    VEC_TYPE(8)
+    left1, middle1, right1;
+    VEC_TYPE(8)
+    left2, middle2, right2;
+
+    int8 values0 = 0;
+    int8 sum0    = 0;
+
+    GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
+    GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
+    GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
+
+#if WEIGHTS_OFFSET != 0
+    sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
+    sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
+    sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    // If conv_stride_y is equals to 1, we compute two output rows
+
+    VEC_TYPE(8)
+    left3, middle3, right3;
+    int8 values1 = 0;
+    int8 sum1    = 0;
+
+    GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
+
+#if WEIGHTS_OFFSET != 0
+    sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
+    sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
+    sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
+#endif /* WEIGHTS_OFFSET != 0 */
+#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
+
+    ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
+    ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
+    values0.s0 += right2.s0 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
+    ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
+    values0.s1 += right2.s1 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
+    ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
+    values0.s2 += right2.s2 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
+    ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
+    values0.s3 += right2.s3 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
+    ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
+    values0.s4 += right2.s4 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
+    ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
+    values0.s5 += right2.s5 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
+    ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
+    values0.s6 += right2.s6 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
+    ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
+    values0.s7 += right2.s7 * w2.s2;
+
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
+    ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
+    values1.s0 += right3.s0 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
+    ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
+    values1.s1 += right3.s1 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
+    ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
+    values1.s2 += right3.s2 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
+    ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
+    values1.s3 += right3.s3 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
+    ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
+    values1.s4 += right3.s4 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
+    ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
+    values1.s5 += right3.s5 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
+    ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
+    values1.s6 += right3.s6 * w2.s2;
+
+    ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
+    ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
+    values1.s7 += right3.s7 * w2.s2;
+#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
+
+#if defined(HAS_BIAS)
+    values0 += (int8)(bias_value);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += (int8)(bias_value);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
+#endif //defined(HAS_BIAS)
+
+#if WEIGHTS_OFFSET != 0
+    values0 += sum0 * (int8)(WEIGHTS_OFFSET);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += sum1 * (int8)(WEIGHTS_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    WEIGHTS_PROMOTED_TYPE sum_weights = 0;
+    VEC_WEIGHTS_PROMOTED_TYPE(3)
+    tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
+    sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
+    values0 += sum_weights * (int8)(INPUT_OFFSET);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += sum_weights * (int8)(INPUT_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    values0 += (int8)(K_OFFSET);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+    values1 += (int8)(K_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
+#endif /* K_OFFSET != 0 */
+
+#if defined(REAL_MULTIPLIER)
+
+    values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    values0             = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#endif // defined(REAL_MULTIPLIER)
+
+    values0 += (int8)OUTPUT_OFFSET;
+    VEC_TYPE(8)
+    res0 = CONVERT_SAT(values0, VEC_TYPE(8));
+
+    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
+
+#if defined(REAL_MULTIPLIER)
+
+    values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    values1             = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#endif // defined(REAL_MULTIPLIER)
+
+    values1 += (int8)OUTPUT_OFFSET;
+    VEC_TYPE(8)
+    res1 = CONVERT_SAT(values1, VEC_TYPE(8));
+
+    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
+}
+
+#endif // !defined(IS_DOT8)
+
+#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
+
+#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
+
+#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
+
+#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT)
+
+#if WEIGHTS_OFFSET != 0
+#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
+    ({                                          \
+        sum += CONVERT(x, VEC_INT);             \
+        MULTIPLY_ADD(x, y, acc);                \
+    })
+#else /* WEIGHTS_OFFSET != 0 */
+#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
+    ({                                                                                 \
+        ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc);                 \
+        ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc);                 \
+        acc += val8 * w1;                                                              \
+    })
+
+#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
+    ({                                                                                   \
+        sum = val0;                                                                      \
+        ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum);             \
+        ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum);             \
+    })
+
+#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
+    ({                                             \
+        sum = w1;                                  \
+        ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum);    \
+        ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum);    \
+    })
+
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
+ *
+ * @note This kernel assumes VEC_SIZE is 4.
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
+ * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
+ * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                                     Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                                       src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                                      Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                                   weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes            The offset of the first element in the weights tensor
+ * @param[in] output_multipliers_ptr                           Pointer to the output multipliers vector. Supported data types: S32
+ * @param[in] output_multipliers_stride_x                      Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] output_multipliers_step_x                        output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
+ * @param[in] output_shifts_ptr                                Pointer to the output shifts vector. Supported data types: S32
+ * @param[in] output_shifts_stride_x                           Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] output_shifts_step_x                             output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_shifts_offset_first_element_in_bytes      The offset of the first element in the output shifts vector
+ * @param[in] biases_ptr                                       (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                                  (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases vector
+ * @param[in] max_offset                                       Max offset for the input tensor
+ */
+__kernel void dwc_3x3_reshaped_quantized8_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    IMAGE_DECLARATION(weights),
+    VECTOR_DECLARATION(output_multipliers),
+    VECTOR_DECLARATION(output_shifts),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    int max_offset)
+{
+    const int x = get_global_id(0); // channels
+    const int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int      z                         = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
+
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr           = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
+
+    int  z_coord = 0;
+    int4 offset  = 0;
+    int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;
+
+    // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
+    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+    // We compute VEC_SIZEx1x1 [C,W,H] elements
+    VEC_INT acc = 0, sum = 0;
+
+    // Load weights
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
+    w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
+    w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));
+
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w0 = w0_tmp.s0123;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w1 = w0_tmp.s4567;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w2 = w0_tmp.s89AB;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w3 = w0_tmp.sCDEF;
+
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w4 = w1_tmp.s0123;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w5 = w1_tmp.s4567;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w6 = w1_tmp.s89AB;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w7 = w1_tmp.sCDEF;
+
+#if INPUT_OFFSET != 0
+    VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
+                     + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
+                     + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    // Load input values
+    // z == 0
+    // Clamp z_coord as for z = 0, it can be negative
+    // z_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
+    z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    offset  = min(offset, (int4)max_offset);
+
+    VEC_TYPE(VEC_SIZE)
+    values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+
+    // z == 1
+    // z_coord can be only negative for z = 0 so we do not need to clamp it
+    // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
+    z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    VEC_TYPE(VEC_SIZE)
+    values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+
+    // z == 2
+    // Offset can be out-of-bound so we need to check if it is greater than max_offset
+    z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    offset  = min(offset, (int4)max_offset);
+    VEC_TYPE(VEC_SIZE)
+    values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+
+    MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
+    MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
+    MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);
+
+    MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
+    MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
+    MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);
+
+    MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
+    MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
+    MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);
+
+#if defined(HAS_BIAS)
+    Vector  biases      = CONVERT_TO_VECTOR_STRUCT(biases);
+    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+    acc += bias_values;
+#endif // defined(HAS_BIAS)
+
+#if WEIGHTS_OFFSET != 0
+    acc += WEIGHTS_OFFSET * sum;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    acc += INPUT_OFFSET * sum_we;
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    acc += (VEC_INT)K_OFFSET;
+#endif /* K_OFFSET != 0 */
+
+#if defined(REAL_MULTIPLIER)
+
+    acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    Vector          output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
+    Vector          output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
+    VEC_INT         output_multiplier  = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
+    VEC_INT         output_shift       = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
+
+    VEC_INT res_shift_lt0              = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res_shift_gt0              = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+    acc                                = select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    acc     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc     = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#endif // defined(REAL_MULTIPLIER)
+
+    acc += (VEC_INT)OUTPUT_OFFSET;
+
+    VEC_TYPE(VEC_SIZE)
+    res = CONVERT_SAT(acc, VEC_TYPE(VEC_SIZE));
+
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr           = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
+    VSTORE(VEC_SIZE)
+    (ACTIVATION_FUNC(res), 0, (__global DATA_TYPE *)(dst_addr));
+}
+#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
+ *
+ * @note This kernel assumes VEC_SIZE is 4.
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
+ * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
+ * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                                     Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                                       src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                                   weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes            The offset of the first element in the weights tensor
+ * @param[in] output_multipliers_ptr                           Pointer to the output multipliers vector. Supported data types: S32
+ * @param[in] output_multipliers_stride_x                      Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] output_multipliers_step_x                        output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
+ * @param[in] output_shifts_ptr                                Pointer to the output shifts vector. Supported data types: S32
+ * @param[in] output_shifts_stride_x                           Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] output_shifts_step_x                             output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_shifts_offset_first_element_in_bytes      The offset of the first element in the output shifts vector
+ * @param[in] biases_ptr                                       (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                                  (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases vector
+ * @param[in] max_offset                                       Max offset for the input tensor
+ */
+
+__kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    IMAGE_DECLARATION(weights),
+    VECTOR_DECLARATION(output_multipliers),
+    VECTOR_DECLARATION(output_shifts),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    int max_offset)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int             z                  = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
+
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr           = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
+
+    int  z_coord = 0;
+    int4 offset  = 0;
+    int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
+
+    // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
+    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+    // We compute 4x2x2 [C,W,H] elements
+    VEC_INT acc0 = 0, sum0 = 0;
+    VEC_INT acc1 = 0, sum1 = 0;
+    VEC_INT acc2 = 0, sum2 = 0;
+    VEC_INT acc3 = 0, sum3 = 0;
+
+    // Load weights
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
+    w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
+    w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));
+
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w0 = w0_tmp.s0123;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w1 = w0_tmp.s4567;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w2 = w0_tmp.s89AB;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w3 = w0_tmp.sCDEF;
+
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w4 = w1_tmp.s0123;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w5 = w1_tmp.s4567;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w6 = w1_tmp.s89AB;
+    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
+    w7 = w1_tmp.sCDEF;
+
+#if INPUT_OFFSET != 0
+    VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
+                     + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
+                     + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    // Load input values
+    // z == 0
+    // Clamp z_coord as for z = 0, it can be negative
+    // z_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
+    z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    offset  = min(offset, (int4)max_offset);
+
+    VEC_TYPE(VEC_SIZE)
+    values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 1
+    // z_coord can be only negative for z = 0 so we do not need to clamp it
+    // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
+    z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    VEC_TYPE(VEC_SIZE)
+    values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 2
+    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
+    // However offset can be out-of-bound so we need to check if it is greater than max_offset
+    offset += (int4)src_stride_z;
+    offset = min(offset, (int4)max_offset);
+    VEC_TYPE(VEC_SIZE)
+    values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 3
+    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
+    // However offset can be out-of-bound so we need to check if it is greater than max_offset
+    offset += (int4)(src_stride_z);
+    offset = min(offset, (int4)max_offset);
+    VEC_TYPE(VEC_SIZE)
+    values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
+    MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
+    MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
+
+    MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
+    MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
+    MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
+
+    MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
+    MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
+    MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
+    MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
+
+    MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
+    MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
+    MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
+
+    MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
+    MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
+    MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
+
+    MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
+    MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
+    MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
+    MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
+
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+
+    acc0 += bias_values;
+    acc1 += bias_values;
+    acc2 += bias_values;
+    acc3 += bias_values;
+#endif /* defined(HAS_BIAS) */
+
+#if WEIGHTS_OFFSET != 0
+    acc0 += WEIGHTS_OFFSET * sum0;
+    acc1 += WEIGHTS_OFFSET * sum1;
+    acc2 += WEIGHTS_OFFSET * sum2;
+    acc3 += WEIGHTS_OFFSET * sum3;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    VEC_INT offs = INPUT_OFFSET * sum_we;
+
+    acc0 += offs;
+    acc1 += offs;
+    acc2 += offs;
+    acc3 += offs;
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    acc0 += (VEC_INT)K_OFFSET;
+    acc1 += (VEC_INT)K_OFFSET;
+    acc2 += (VEC_INT)K_OFFSET;
+    acc3 += (VEC_INT)K_OFFSET;
+#endif /* K_OFFSET != 0 */
+
+#if defined(REAL_MULTIPLIER)
+
+    acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    Vector          output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
+    Vector          output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
+    VEC_INT         output_multiplier  = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
+    VEC_INT         output_shift       = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
+
+    VEC_INT res0_shift_lt0   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res1_shift_lt0   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res2_shift_lt0   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res3_shift_lt0   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res0_shift_gt0   = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+    VEC_INT res1_shift_gt0   = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+    VEC_INT res2_shift_gt0   = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+    VEC_INT res3_shift_gt0   = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+    acc0                     = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);
+    acc1                     = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);
+    acc2                     = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);
+    acc3                     = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    acc0    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+    acc1    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+    acc2    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+    acc3    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc0    = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1    = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc2    = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc3    = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#endif // defined(REAL_MULTIPLIER)
+
+    acc0 += (VEC_INT)OUTPUT_OFFSET;
+    acc1 += (VEC_INT)OUTPUT_OFFSET;
+    acc2 += (VEC_INT)OUTPUT_OFFSET;
+    acc3 += (VEC_INT)OUTPUT_OFFSET;
+
+    VEC_TYPE(VEC_SIZE)
+    res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));
+    VEC_TYPE(VEC_SIZE)
+    res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));
+    VEC_TYPE(VEC_SIZE)
+    res2 = CONVERT_SAT(acc2, VEC_TYPE(VEC_SIZE));
+    VEC_TYPE(VEC_SIZE)
+    res3 = CONVERT_SAT(acc3, VEC_TYPE(VEC_SIZE));
+
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
+    VSTORE(VEC_SIZE)
+    (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
+    VSTORE(VEC_SIZE)
+    (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
+
+#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
+    if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
+#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
+    {
+        VSTORE(VEC_SIZE)
+        (ACTIVATION_FUNC(res2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
+        VSTORE(VEC_SIZE)
+        (ACTIVATION_FUNC(res3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
+    }
+}
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.
+ *
+ * @note Per-channel quantization is not supported by this kernel.
+ * @note This kernel assumes VEC_SIZE is 4.
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
+ * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
+ * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
+ * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
+ *       If not, the quantization will be performed using a fixed point multiplication
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                                     Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                                       src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                                   weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes            The offset of the first element in the weights tensor
+ * @param[in] output_multipliers_ptr                           Pointer to the output multipliers vector. Supported data types: S32
+ * @param[in] output_multipliers_stride_x                      Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] output_multipliers_step_x                        output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
+ * @param[in] output_shifts_ptr                                Pointer to the output shifts vector. Supported data types: S32
+ * @param[in] output_shifts_stride_x                           Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] output_shifts_step_x                             output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_shifts_offset_first_element_in_bytes      The offset of the first element in the output shifts vector
+ * @param[in] biases_ptr                                       (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                                  (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases vector
+ * @param[in] max_offset                                       The maximum allowed offset for the input tensor
+ */
+__kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    IMAGE_DECLARATION(weights),
+    VECTOR_DECLARATION(output_multipliers),
+    VECTOR_DECLARATION(output_shifts),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases),
+#endif // defined(HAS_BIAS)
+    int max_offset)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
+
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
+
+    int  z_coord = 0;
+    int4 offset  = 0;
+    int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
+
+    // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
+    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+    // We compute 4x2x1 [C,W,H] elements
+    VEC_INT acc0 = 0;
+    VEC_INT acc1 = 0;
+    VEC_INT sum0 = 0;
+    VEC_INT sum1 = 0;
+
+    // Load weights
+    VEC_TYPE(16)
+    w0 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
+    VEC_TYPE(16)
+    w1 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
+    VEC_TYPE(4)
+    w2 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32));
+
+#if INPUT_OFFSET != 0
+    // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
+
+    // Multiply the weights reduction with INPUT_OFFSET
+    acc0 = INPUT_OFFSET * acc0;
+
+    acc1 = acc0;
+#endif // INPUT_OFFSET != 0
+
+    // Load input values
+    // z == 0
+    // Clamp z_coord as for z = 0, it can be negative
+    // z_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    z_coord = z - (int)CONV_PAD_TOP;
+    z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    offset  = min(offset, (int4)max_offset);
+
+    VEC_TYPE(VEC_SIZE)
+    values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 1
+    // z_coord can be only negative for z = 0 so we do not need to clamp it
+    // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
+    z_coord = z - (int)CONV_PAD_TOP + 1;
+    offset  = y_offset + (int4)(z_coord * src_stride_z);
+    VEC_TYPE(VEC_SIZE)
+    values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    // z == 2
+    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
+    // However offset can be out-of-bound so we need to check if it is greater than max_offset
+    offset += (int4)src_stride_z;
+    offset = min(offset, (int4)max_offset);
+    VEC_TYPE(VEC_SIZE)
+    values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_TYPE(VEC_SIZE)
+    values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_TYPE(VEC_SIZE)
+    values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_TYPE(VEC_SIZE)
+    values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
+
+    DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
+    DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
+    DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
+    DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
+
+    DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
+    DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
+    DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+    DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+
+    DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
+    DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
+    DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
+    DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
+
+    DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
+    DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
+    DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
+    DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
+
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+
+    acc0 += bias_values;
+    acc1 += bias_values;
+
+#endif // defined(HAS_BIAS)
+
+#if WEIGHTS_OFFSET != 0
+    acc0 += WEIGHTS_OFFSET * sum0;
+    acc1 += WEIGHTS_OFFSET * sum1;
+#endif // WEIGHTS_OFFSET != 0
+
+#if K_OFFSET != 0
+    acc0 += (VEC_INT)K_OFFSET;
+    acc1 += (VEC_INT)K_OFFSET;
+
+#endif // K_OFFSET != 0
+
+#if defined(REAL_MULTIPLIER)
+
+    acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
+
+#if OUTPUT_SHIFT < 0
+    acc0                     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+    acc1                     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc0    = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1    = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
+
+#endif // defined(REAL_MULTIPLIER)
+    acc0 += (VEC_INT)OUTPUT_OFFSET;
+    acc1 += (VEC_INT)OUTPUT_OFFSET;
+
+    VEC_TYPE(VEC_SIZE)
+    res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));
+    VEC_TYPE(VEC_SIZE)
+    res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));
+
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
+    VSTORE(VEC_SIZE)
+    (ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4
+
+#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
+
+#endif // defined(WEIGHTS_PROMOTED_TYPE)
+
+#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
+
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
+/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
+ *
+ * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
+ * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
+ * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
+ * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
+ * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
+ * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
+ * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                                     Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                                       src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                                   weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                                 Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                                   weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes            The offset of the first element in the weights tensor
+ * @param[in] output_multipliers_ptr                           Pointer to the output multipliers vector. Supported data types: S32
+ * @param[in] output_multipliers_stride_x                      Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] output_multipliers_step_x                        output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
+ * @param[in] output_shifts_ptr                                Pointer to the output shifts vector. Supported data types: S32
+ * @param[in] output_shifts_stride_x                           Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] output_shifts_step_x                             output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_shifts_offset_first_element_in_bytes      The offset of the first element in the output shifts vector
+ * @param[in] biases_ptr                                       (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                                  (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases vector
+ */
+__kernel void dwc_MxN_native_quantized8_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+    VECTOR_DECLARATION(output_multipliers),
+    VECTOR_DECLARATION(output_shifts)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif // defined(HAS_BIAS)
+)
+{
+    int x = get_global_id(0); // channels
+    int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else                                          // defined(DST_DEPTH)
+    int z = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
+
+    __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
+
+    __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
+
+    __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
+
+#if defined(HAS_BIAS)
+    __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
+#endif // defined(HAS_BIAS)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+    __global uchar *out_mul_addr   = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
+    __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#if defined(DST_DEPTH)
+    s_addr += b * src_stride_w;
+    d_addr += b * dst_stride_w;
+#endif // defined(DST_DEPTH)
+
+#if DEPTH_MULTIPLIER > 1
+    for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
+    {
+#endif // DEPTH_MULTIPLIER > 1
+        // Each work-item computes N0x1x1 elements
+        VEC_INT res = 0;
+
+        int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
+        int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
+
+        for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+        {
+            if(y_coord >= 0 && y_coord < SRC_DIM2)
+            {
+                int x_coord_tmp = x_coord;
+
+                for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+                {
+                    if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
+                    {
+                        int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
+                        int w_offset = xk * weights_stride_y + yk * weights_stride_z;
+
+                        // Load input and weights values
+                        VEC_INT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_INT);
+                        VEC_INT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_INT);
+
+                        res += (i + (VEC_INT)INPUT_OFFSET) * (w + (VEC_INT)WEIGHTS_OFFSET);
+                    }
+                    x_coord_tmp += DILATION_X;
+                }
+            }
+            y_coord += DILATION_Y;
+        }
+
+#if defined(HAS_BIAS)
+        VEC_INT bias = VLOAD(N0)(0, (__global int *)(b_addr));
+        res += bias;
+#endif // defined(HAS_BIAS)
+
+#if defined(PER_CHANNEL_QUANTIZATION)
+        VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
+        VEC_INT output_shift      = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+
+        VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
+        VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
+        res                   = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+        res   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#else  // OUTPUT_SHIFT < 0
+        res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+        res += (VEC_INT)OUTPUT_OFFSET;
+
+        VEC_TYPE(VEC_SIZE)
+        res1 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));
+
+        VSTORE(N0)
+        (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(d_addr));
+
+#if DEPTH_MULTIPLIER > 1
+        w_addr += sizeof(WEIGHTS_TYPE);
+        d_addr += sizeof(DATA_TYPE);
+#if defined(PER_CHANNEL_QUANTIZATION)
+        out_mul_addr += sizeof(int);
+        out_shift_addr += sizeof(int);
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+#if defined(HAS_BIAS)
+        b_addr += sizeof(int);
+#endif // defined(HAS_BIAS)
+    }
+#endif // DEPTH_MULTIPLIER > 1
+}
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
+#endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/dequantization_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/dequantization_layer.clembed
new file mode 100644
index 0000000..68c1248
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/dequantization_layer.clembed

@@ -0,0 +1,755 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+/** This performs the dequantization of 8-bit unsigned integers to floating point.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void dequantization_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = SCALE;
+
+    const VEC_DATA_TYPE(int, VEC_SIZE)
+    voffset = OFFSET;
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    __global float *scale)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = scale[get_global_id(2)];
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    __global float *scale)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+    scale -= max(xi - (int)LAST_ACCESSED_X, 0);
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/derivative.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/derivative.clembed
new file mode 100644
index 0000000..191cf4c
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/derivative.clembed

@@ -0,0 +1,623 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This OpenCL kernel that computes the first-order derivative.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void derivative(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif /* GRAD_Y */
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+
+#ifdef GRAD_X
+    short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
+    short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
+    vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
+    short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
+    vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
+#endif /* GRAD_Y */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/dilate.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/dilate.clembed
new file mode 100644
index 0000000..006d715
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/dilate.clembed

@@ -0,0 +1,599 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function dilates an input image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void dilate(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 top    = vload16(0, offset(&src, -1, -1));
+    uchar16 middle = vload16(0, offset(&src, -1, 0));
+    uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+    uchar16 tmp = max(top, max(middle, bottom));
+    uchar8  out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789));
+
+    vstore8(out, 0, dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution1x1.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution1x1.clembed
new file mode 100644
index 0000000..77a9a1b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution1x1.clembed

@@ -0,0 +1,975 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#undef CONVERT_SAT
+
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(DATA_LAYOUT_NHWC)
+
+#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
+
+/** This kernel performs a direct convolution to convolve the low three dimensions of a tensor with data layout NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* defined(HAS_BIAS) */
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    values        = 0;
+    const int id0 = get_global_id(0);
+    const int id1 = get_global_id(1);
+    const int id2 = get_global_id(2);
+    weights.ptr += id0 * weights_stride_w;
+    __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + id2 * STRIDE_Y * (int)src_stride_z;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
+#if STRIDE_X == 1
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
+                   PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 1 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 3 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 5 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 7 * src_stride_y, DATA_TYPE));
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
+                   PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 8 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 10 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 12 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(src_addr + 14 * src_stride_y, DATA_TYPE));
+#else               /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+        values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, col0));
+
+        src_addr += src_stride_x;
+        weights.ptr += weights_stride_x;
+    }
+
+#ifdef HAS_BIAS
+    values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0))));
+#endif /* defined(HAS_BIAS) */
+
+    *((__global DATA_TYPE *)dst.ptr)                      = values.s0;
+    *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values.s1;
+    *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values.s2;
+    *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values.s3;
+    *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values.s4;
+    *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values.s5;
+    *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values.s6;
+    *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;
+}
+#endif // defined(DATA_LAYOUT_NHWC)
+
+#if STRIDE_X == 3
+#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
+#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
+#elif STRIDE_X == 2
+#define INPUT_PIXEL(data_size) extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_PIXEL(data_size) extract_input_stride1
+#else /* STRIDE_X not equals 1, 2 or 3 */
+#error "Only support strides 1, 2 and 3"
+#endif /* STRIDE_X == 3 */
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
+{
+    return vload8(0, input_pixel);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp = vload16(0, input_pixel);
+    return temp.s02468ace;
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    temp1 = vload4(0, input_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    temp2 = vload4(0, input_pixel + 6);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    temp3 = vload4(0, input_pixel + 12);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    temp4 = vload4(0, input_pixel + 18);
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp1 = vload8(0, input_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp2 = vload8(0, input_pixel + 8);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp3 = vload8(0, input_pixel + 16);
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp1 = vload16(0, input_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp2 = vload16(0, input_pixel + 12);
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
+}
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* defined(HAS_BIAS) */
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    values = 0;
+
+    const uint z_index = get_global_id(2);
+
+    weights.ptr += z_index * weights_stride_w;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
+        values      = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
+        src.ptr += src_stride_z;
+        weights.ptr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
+    ({                                                 \
+        acc.s0 = mad(src.s0, weight_value, acc.s0);    \
+        acc.s1 = mad(src.s1, weight_value, acc.s1);    \
+        acc.s2 = mad(src.s2, weight_value, acc.s2);    \
+        acc.s3 = mad(src.s3, weight_value, acc.s3);    \
+    })
+
+/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note In case biases, -DHAS_BIAS must to be passed at compile
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1_f32_bifrost(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    // Get the kernel index
+    const int kernel_index = get_global_id(2);
+
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float4 acc0 = 0.0f;
+    float4 acc1 = 0.0f;
+    float4 acc2 = 0.0f;
+    float4 acc3 = 0.0f;
+
+    __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+    {
+        // Load the weights
+        float weight = *((__global float *)weights_addr);
+
+        // Load values from row0 of input tensor
+        float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+        float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+        float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+        float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+        CONVOLUTION1x1_BIFROST(acc0, src0, weight);
+        CONVOLUTION1x1_BIFROST(acc1, src1, weight);
+        CONVOLUTION1x1_BIFROST(acc2, src2, weight);
+        CONVOLUTION1x1_BIFROST(acc3, src3, weight);
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+    acc0.s0 += bias;
+    acc0.s1 += bias;
+    acc0.s2 += bias;
+    acc0.s3 += bias;
+    acc1.s0 += bias;
+    acc1.s1 += bias;
+    acc1.s2 += bias;
+    acc1.s3 += bias;
+    acc2.s0 += bias;
+    acc2.s1 += bias;
+    acc2.s2 += bias;
+    acc2.s3 += bias;
+    acc3.s0 += bias;
+    acc3.s1 += bias;
+    acc3.s2 += bias;
+    acc3.s3 += bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+    vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution3x3.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution3x3.clembed
new file mode 100644
index 0000000..948afd0
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution3x3.clembed

@@ -0,0 +1,1013 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#undef CONVERT_SAT
+
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                                                  \
+    ({                                                                                                                                             \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                                                                \
+        weights_values0 = vload3(0, weights_row_ptr);                                                                                              \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                                \
+        src0 = vload8(0, src_row_ptr);                                                                                                             \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                                \
+        src1 = vload2(0, src_row_ptr + 8);                                                                                                         \
+        \
+        acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0));                                                          \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+    })
+
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                                               \
+    ({                                                                                                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                                                             \
+        weights_values0 = vload3(0, weights_row_ptr);                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, 16)                                                                                                            \
+        src0           = vload16(0, src_row_ptr);                                                                                               \
+        DATA_TYPE src1 = *(src_row_ptr + 16);                                                                                                   \
+        \
+        acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0));                                                  \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1));      \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+    })
+
+#if defined(DATA_LAYOUT_NHWC)
+
+#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x3_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x3_STRIDE_NHWC_STRIDE1(acc, row_ptr, weights_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x3_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x3_STRIDE_NHWC_STRIDE2(acc, row_ptr, weights_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x3_STRIDE_NHWC_STRIDE1(acc, row_ptr, weights_ptr)                                                                      \
+    {                                                                                                                                      \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                        \
+        src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                                              \
+                PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE));                                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                        \
+        src1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(                                                                                              \
+                PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE),                                                                           \
+                PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE));                                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                                                        \
+        weights = (VEC_DATA_TYPE(DATA_TYPE, 3))(                                                                                           \
+                  PTR_TO_VALUE((weights_ptr) + 0 * weights_stride_y, DATA_TYPE),                                                                 \
+                  PTR_TO_VALUE((weights_ptr) + 1 * weights_stride_y, DATA_TYPE),                                                                 \
+                  PTR_TO_VALUE((weights_ptr) + 2 * weights_stride_y, DATA_TYPE));                                                                \
+        acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s0));                                                          \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s1)); \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s2)); \
+    }
+
+#define CONVOLUTION1x3_STRIDE_NHWC_STRIDE2(acc, row_ptr, weights_ptr)                                                                   \
+    {                                                                                                                                   \
+        VEC_DATA_TYPE(DATA_TYPE, 16)                                                                                                    \
+        src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE),                                                                        \
+                PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE),                                                                       \
+                PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE),                                                                       \
+                PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE),                                                                       \
+                PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE),                                                                       \
+                PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE),                                                                       \
+                PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE));                                                                      \
+        DATA_TYPE src1 = PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE);                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                                                     \
+        weights = (VEC_DATA_TYPE(DATA_TYPE, 3))(                                                                                        \
+                  PTR_TO_VALUE((weights_ptr) + 0 * weights_stride_y, DATA_TYPE),                                                              \
+                  PTR_TO_VALUE((weights_ptr) + 1 * weights_stride_y, DATA_TYPE),                                                              \
+                  PTR_TO_VALUE((weights_ptr) + 2 * weights_stride_y, DATA_TYPE));                                                             \
+        \
+        acc = ADD_OP(acc, MUL_OP(src0.s02468ACE, (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s0));                                             \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s1));      \
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s2)); \
+    }
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note This OpenCL kernel works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    values0       = 0;
+    const int id0 = get_global_id(0);
+    const int id1 = get_global_id(1);
+    const int id2 = get_global_id(2);
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
+
+    weights_addr += id0 * weights_stride_w;
+
+    const int coordy = ((id2 * STRIDE_Y) - PAD_TOP);
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+#if PAD_TOP > 0
+        if(coordy < 0) // special case Z = -1 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x3_NHWC(values0, src_addr + 2 * (int)src_stride_z, (weights_addr + 2 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - PAD_TOP - 1))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x3_NHWC(values0, src_addr, (weights_addr + 0 * (int)weights_stride_z));
+            CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x3_NHWC(values0, src_addr, (weights_addr + 0 * (int)weights_stride_z));
+            CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x3_NHWC(values0, src_addr + 2 * (int)src_stride_z, (weights_addr + 2 * (int)weights_stride_z));
+        }
+#else  // PAD_TOP > 0
+        CONVOLUTION1x3_NHWC(values0, src_addr, (weights_addr + 0 * (int)weights_stride_z));
+        CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
+        CONVOLUTION1x3_NHWC(values0, src_addr + 2 * (int)src_stride_z, (weights_addr + 2 * (int)weights_stride_z));
+#endif // PAD_TOP > 0
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    values0       = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0))));
+#endif /* defined(HAS_BIAS) */
+
+    *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values0.s0;
+    *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values0.s1;
+    *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values0.s2;
+    *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values0.s3;
+    *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values0.s4;
+    *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values0.s5;
+    *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values0.s6;
+    *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values0.s7;
+}
+#endif // defined(DATA_LAYOUT_NHWC)
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note This OpenCL kernel works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    values0 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
+    ({                                                        \
+        acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0);       \
+        acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1);       \
+        acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2);       \
+        acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3);       \
+        acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0);       \
+        acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1);       \
+        acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2);       \
+        acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3);       \
+        acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0);       \
+        acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1);       \
+        acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2);       \
+        acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3);       \
+    })
+
+/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note In case biases, -DHAS_BIAS must to be passed at compile
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3_f32_bifrost(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    // Get the kernel index
+    const int kernel_index = get_global_id(2);
+
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float4 values0 = 0;
+    float4 values1 = 0;
+    float4 values2 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
+
+    for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+    {
+        // Load the weights
+        float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+        float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+        float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+        float4 src0;
+        float2 src1;
+
+        // Load values from row0 of input tensor
+        src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+        src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
+
+        CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);
+
+        // Load values from row1 of input tensor
+        src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+        src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
+
+        // Accumulate
+        CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
+        CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);
+
+        // Load values from row2 of input tensor
+        src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+        src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
+
+        // Accumulate
+        CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
+        CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
+        CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);
+
+        // Load values from row3 of input tensor
+        src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+        src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
+
+        // Accumulate
+        CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
+        CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);
+
+        // Row4
+        src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+        src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
+
+        // Accumulate
+        CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+    values0 += (float4)bias;
+    values1 += (float4)bias;
+    values2 += (float4)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution5x5.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution5x5.clembed
new file mode 100644
index 0000000..3f6a856
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution5x5.clembed

@@ -0,0 +1,1092 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                               \
+    ({                                                                                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                             \
+        weights_values0          = vload4(0, weights_row_ptr);                                                                  \
+        DATA_TYPE weights_value1 = *(weights_row_ptr + 4);                                                                      \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                             \
+        src0 = vload8(0, src_row_ptr);                                                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                             \
+        src1 = vload4(0, src_row_ptr + 8);                                                                                      \
+        \
+        acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0;                                                          \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1;     \
+    })
+
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                               \
+    ({                                                                                                                          \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                             \
+        weights_values0          = vload4(0, weights_row_ptr);                                                                  \
+        DATA_TYPE weights_value1 = *(weights_row_ptr + 4);                                                                      \
+        VEC_DATA_TYPE(DATA_TYPE, 16)                                                                                            \
+        src0 = vload16(0, src_row_ptr);                                                                                         \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                             \
+        src1 = vload4(0, src_row_ptr + 16);                                                                                     \
+        acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0;                                                     \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1;         \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1;     \
+    })
+
+#if defined(DATA_LAYOUT_NHWC)
+
+#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr)                                                                         \
+    ({                                                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                    \
+        src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE));                 \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                    \
+        src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE));               \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                    \
+        weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(                                                                               \
+                          PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \
+        DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE);                                        \
+        acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0;                                                                 \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1;            \
+    })
+
+#define CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr)                                                                         \
+    ({                                                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 16)                                                                                                   \
+        src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))(                                                                                         \
+                PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE));               \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                    \
+        src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 17 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 18 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 19 * src_stride_y, DATA_TYPE));               \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                    \
+        weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(                                                                               \
+                          PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \
+        DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE);                                        \
+        acc += src0.s02468ACE * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0;                                                       \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1;                \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2;        \
+        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1;            \
+    })
+
+/** This kernel performs a direct convolution to convolve the low three dimensions in a tensor with the NHWC data layout
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    values0 = 0;
+
+    const int id0 = get_global_id(0);
+    const int id1 = get_global_id(1);
+    const int id2 = get_global_id(2);
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
+
+    weights_addr += id0 * weights_stride_w;
+
+#if(PAD_TOP == 1)
+    const int coordy = id2 - PAD_TOP;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        if(coordy < 0) // special case Z = -1 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == (DST_HEIGHT - PAD_TOP - 1))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+#elif(PAD_TOP == 2)
+    const int coordy = id2 * STRIDE_Y;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        if(coordy == 0) // special case Z = -2 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == 1) // special case Z = -1 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 1))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 2))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+
+#else  /*  PAD_TOP == 2 */
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+        CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+        CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+        CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+        CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+#endif /*  PAD_TOP == 1 */
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0)));
+#endif /* defined(HAS_BIAS) */
+
+    *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values0.s0;
+    *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values0.s1;
+    *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values0.s2;
+    *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values0.s3;
+    *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values0.s4;
+    *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values0.s5;
+    *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values0.s6;
+    *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values0.s7;
+}
+
+#endif // defined(DATA_LAYOUT_NHWC)
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    values0 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
+    ({                                                                  \
+        acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0);                \
+        acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1);                \
+        acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2);                \
+        acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3);                \
+        acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0);                \
+        acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1);                \
+        acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2);                \
+        acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3);                \
+        acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0);                \
+        acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1);                \
+        acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2);                \
+        acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3);                \
+        acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0);                \
+        acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1);                \
+        acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2);                \
+        acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3);                \
+        acc.s0 = mad(src0.s4, weights_row01, acc.s0);                   \
+        acc.s1 = mad(src0.s5, weights_row01, acc.s1);                   \
+        acc.s2 = mad(src0.s6, weights_row01, acc.s2);                   \
+        acc.s3 = mad(src0.s7, weights_row01, acc.s3);                   \
+    })
+
+/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5_f32_bifrost(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    // Get the kernel index
+    const int kernel_index = get_global_id(2);
+
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float4 values0 = 0.0f;
+    float4 values1 = 0.0f;
+
+    __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
+
+    for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+    {
+        // Load the weights from row0 and row1
+        float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+        float  weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
+        float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+        float  weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
+        float8 src0;
+
+        // Load values from row0 of input tensor
+        src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
+
+        // Accumulate
+        CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
+
+        // Load values from row1 of input tensor
+        src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
+
+        // Accumulate
+        CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
+        CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
+
+        // Load values from row2 of input tensor
+        src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+        // Load weights from row2
+        weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+        weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
+
+        // Accumulate
+        CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
+        CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
+
+        // Load values from row3 of input tensor
+        src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+        // Load weights from row3
+        weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
+        weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
+
+        // Accumulate
+        CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
+        CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
+
+        // Load values from row4 of input tensor
+        src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
+
+        // Load weights from row4
+        weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
+        weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
+
+        CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
+        CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
+
+        // Load values from row5 of input tensor
+        src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
+
+        // Accumulate
+        CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+    values0 += bias;
+    values1 += bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution9x9.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution9x9.clembed
new file mode 100644
index 0000000..023e3ba
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution9x9.clembed

@@ -0,0 +1,989 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC)
+
+#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
+
+#define CONVOLUTION1x9_STRIDE1_NHWC(acc, row_ptr, weights_ptr)                                                                         \
+    ({                                                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                    \
+        src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE));                 \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                    \
+        src1 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE));               \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                    \
+        weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                               \
+                          PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 5 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 6 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 7 * weights_stride_y, DATA_TYPE)); \
+        DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 8 * weights_stride_y, DATA_TYPE);                                        \
+        acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0;                                                                 \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s4567, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s4;                \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s567, src1.s0123, src1.s4) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s5;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s67, src1.s012, src1.s345) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s6;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s7, src1.s0123, src1.s456) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s7;        \
+        acc += src1 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1;                                                                     \
+    })
+
+#define CONVOLUTION1x9_STRIDE2_NHWC(acc, row_ptr, weights_ptr)                                                                         \
+    ({                                                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 16)                                                                                                   \
+        src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))(                                                                                         \
+                PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE),                  \
+                PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE));               \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                    \
+        src1 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                                          \
+                PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 17 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 18 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 19 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 20 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 21 * src_stride_y, DATA_TYPE),                \
+                PTR_TO_VALUE(row_ptr + 22 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 23 * src_stride_y, DATA_TYPE));               \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                    \
+        weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(                                                                               \
+                          PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 5 * weights_stride_y, DATA_TYPE),  \
+                          PTR_TO_VALUE(weights_ptr + 6 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 7 * weights_stride_y, DATA_TYPE)); \
+        DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 8 * weights_stride_y, DATA_TYPE);                                        \
+        acc += src0.s02468ACE * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0;                                                       \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1;                \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468A, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s4;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s579, src0.sBDF, src1.s13) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s5;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s68A, src0.sCE, src1.s024) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s6;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s79B, src0.sDF, src1.s135) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s7;        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s8AC, src0.sE, src1.s0246) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1;            \
+    })
+
+#if defined(VEC_SIZE)
+#define VFMA(acc, w, src0, src1, src2, src3, src4, src5, src6, src7) \
+    ({                                                               \
+        acc##0 = fma(src0, w, acc##0);                               \
+        acc##1 = fma(src1, w, acc##1);                               \
+        acc##2 = fma(src2, w, acc##2);                               \
+        acc##3 = fma(src3, w, acc##3);                               \
+        acc##4 = fma(src4, w, acc##4);                               \
+        acc##5 = fma(src5, w, acc##5);                               \
+        acc##6 = fma(src6, w, acc##6);                               \
+        acc##7 = fma(src7, w, acc##7);                               \
+    })
+
+#define CONVOLUTION1x9_STRIDE1_NHWC_BIFROST(acc, row_ptr, weights_ptr)                       \
+    ({                                                                                       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)row_ptr);                            \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + src_stride_y));           \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 2 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 3 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 4 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 5 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 6 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 7 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 8 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 9 * src_stride_y));       \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 10 * src_stride_y));     \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 11 * src_stride_y));     \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 12 * src_stride_y));     \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 13 * src_stride_y));     \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 14 * src_stride_y));     \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        src15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 15 * src_stride_y));     \
+        \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 0 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 1 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 2 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 3 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 4 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 5 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 6 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 7 * weights_stride_y)); \
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)                                                   \
+        w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 8 * weights_stride_y)); \
+        \
+        VFMA(acc, w0, src0, src1, src2, src3, src4, src5, src6, src7);                       \
+        VFMA(acc, w1, src1, src2, src3, src4, src5, src6, src7, src8);                       \
+        VFMA(acc, w2, src2, src3, src4, src5, src6, src7, src8, src9);                       \
+        VFMA(acc, w3, src3, src4, src5, src6, src7, src8, src9, src10);                      \
+        VFMA(acc, w4, src4, src5, src6, src7, src8, src9, src10, src11);                     \
+        VFMA(acc, w5, src5, src6, src7, src8, src9, src10, src11, src12);                    \
+        VFMA(acc, w6, src6, src7, src8, src9, src10, src11, src12, src13);                   \
+        VFMA(acc, w7, src7, src8, src9, src10, src11, src12, src13, src14);                  \
+        VFMA(acc, w8, src8, src9, src10, src11, src12, src13, src14, src15);                 \
+    })
+
+#if VEC_SIZE == 4
+#define REDUCE(out, vec)            \
+    ({                              \
+        VEC_DATA_TYPE(DATA_TYPE, 2) \
+        tmp1 = vec.s01 + vec.s23;   \
+        out  = tmp1.s0 + tmp1.s1;   \
+    })
+#else // VEC_SIZE == 4
+#error("Not supported")
+#endif // VEC_SIZE == 4
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x9_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x9_STRIDE1_NHWC_BIFROST(acc, row_ptr, weights_ptr)
+#else // STRIDE_X == 1
+#error "Not supported"
+#endif // STRIDE_X == 1
+
+#else // defined(VEC_SIZE)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x9_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x9_STRIDE1_NHWC(acc, row_ptr, weights_ptr)
+#elif STRIDE_X == 2 // STRIDE_X == 1
+#define CONVOLUTION1x9_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x9_STRIDE2_NHWC(acc, row_ptr, weights_ptr)
+#else // STRIDE_X == 1
+#error "STRIDE_X larger than 2 is not supported"
+#endif // STRIDE_X == 1
+
+#endif // defined(VEC_SIZE)
+
+//#if defined(VEC_SIZE)
+/** This kernel performs a direct convolution to convolve the low three dimensions in a tensor with the NHWC data layout
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            (Optional) Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      (Optional) Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution9x9_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    values = 0;
+
+#if defined(VEC_SIZE)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values0 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values1 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values2 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values3 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values4 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values5 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values6 = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values7 = 0;
+#define STEP_X (VEC_SIZE)
+#else // defined(VEC_SIZE)
+#define STEP_X (1)
+#endif // defined(VEC_SIZE)
+
+    const int id0 = get_global_id(0);
+    const int id1 = get_global_id(1);
+    const int id2 = get_global_id(2);
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
+
+    weights_addr += id0 * weights_stride_w;
+
+#if(PAD_TOP == 1)
+    const int coordy = id2 - PAD_TOP;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+    {
+        if(coordy < 0) // special case Z = -1 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        }
+        else if(coordy == (DST_HEIGHT - PAD_TOP - 1))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        }
+        src_addr += STEP_X * sizeof(DATA_TYPE);
+        weights_addr += STEP_X * sizeof(DATA_TYPE);
+    }
+#elif(PAD_TOP == 2) // PAD_TOP == 1
+    const int coordy = id2 * STRIDE_Y;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+    {
+        if(coordy == 0) // special case Z = -2 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        }
+        else if(coordy == 1) // special case Z = -1 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 5))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 6))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        }
+        src_addr += STEP_X * sizeof(DATA_TYPE);
+        weights_addr += STEP_X * sizeof(DATA_TYPE);
+    }
+
+#else  // PAD_TOP == 1
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+    {
+        CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+        CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+        CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        src_addr += STEP_X * sizeof(DATA_TYPE);
+        weights_addr += STEP_X * sizeof(DATA_TYPE);
+    }
+#endif // PAD_TOP == 1
+
+#if defined(VEC_SIZE)
+    REDUCE(values.s0, values0);
+    REDUCE(values.s1, values1);
+    REDUCE(values.s2, values2);
+    REDUCE(values.s3, values3);
+    REDUCE(values.s4, values4);
+    REDUCE(values.s5, values5);
+    REDUCE(values.s6, values6);
+    REDUCE(values.s7, values7);
+#endif // defined(VEC_SIZE)
+
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    values += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0)));
+#endif // defined(HAS_BIAS)
+
+    *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values.s0;
+    *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values.s1;
+    *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values.s2;
+    *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values.s3;
+    *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values.s4;
+    *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values.s5;
+    *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values.s6;
+    *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;
+#undef STEP_X
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution_quantized.clembed
new file mode 100644
index 0000000..50bc676
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/direct_convolution_quantized.clembed

@@ -0,0 +1,1336 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+
+#if KERNEL_SIZE == 9
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+
+#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                            \
+    ({                                                                                                       \
+        int8  weights_values0 = convert_int8(vload8(0, weights_row_ptr));                                    \
+        int   weights_value1  = convert_int(*(weights_row_ptr + 8));                                         \
+        int16 src0            = convert_int16(vload16(0, src_row_ptr));                                      \
+        acc += (src0.lo + input_offset) * ((int8)weights_values0.s0 + weight_offset);                        \
+        acc += ((int8)(src0.s1234, src0.s5678) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
+        acc += ((int8)(src0.s2345, src0.s6789) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+        acc += ((int8)(src0.s3456, src0.s789A) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
+        acc += ((int8)(src0.s4567, src0.s89AB) + input_offset) * ((int8)weights_values0.s4 + weight_offset); \
+        acc += ((int8)(src0.s5678, src0.s9ABC) + input_offset) * ((int8)weights_values0.s5 + weight_offset); \
+        acc += ((int8)(src0.s6789, src0.sABCD) + input_offset) * ((int8)weights_values0.s6 + weight_offset); \
+        acc += ((int8)(src0.s789A, src0.sBCDE) + input_offset) * ((int8)weights_values0.s7 + weight_offset); \
+        acc += ((int8)(src0.s89AB, src0.sCDEF) + input_offset) * ((int8)weights_value1 + weight_offset);     \
+    })
+
+#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int8  weights_values0 = convert_int8(vload8(0, weights_row_ptr));                                            \
+        int   weights_value1  = convert_int(*(weights_row_ptr + 8));                                                 \
+        int16 src0            = convert_int16(vload16(0, src_row_ptr));                                              \
+        int8  src1            = convert_int8(vload8(0, src_row_ptr + 16));                                           \
+        acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset);                              \
+        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset);         \
+        acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+        acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
+        acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + input_offset) * ((int8)weights_values0.s4 + weight_offset); \
+        acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + input_offset) * ((int8)weights_values0.s5 + weight_offset); \
+        acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + input_offset) * ((int8)weights_values0.s6 + weight_offset); \
+        acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + input_offset) * ((int8)weights_values0.s7 + weight_offset); \
+        acc += ((int8)(src0.s8ACE, src1.s0246) + input_offset) * ((int8)weights_value1 + weight_offset);             \
+    })
+
+#elif KERNEL_SIZE == 5
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr));                                             \
+        int  weights_value1  = convert_int(*(weights_row_ptr + 4));                                                  \
+        int8 src0            = convert_int8(vload8(0, src_row_ptr));                                                 \
+        int4 src1            = convert_int4(vload4(0, src_row_ptr + 8));                                             \
+        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset);                                   \
+        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
+        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+        acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
+        acc += ((int8)(src0.s45, src0.s67, src1.s0123) + input_offset) * ((int8)weights_value1 + weight_offset);     \
+    })
+
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int4  weights_values0 = convert_int4(vload4(0, weights_row_ptr));                                            \
+        int   weights_value1  = convert_int(*(weights_row_ptr + 4));                                                 \
+        int16 src0            = convert_int16(vload16(0, src_row_ptr));                                              \
+        int4  src1            = convert_int4(vload4(0, src_row_ptr + 16));                                           \
+        acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset);                              \
+        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset);         \
+        acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+        acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
+        acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset);     \
+    })
+
+#elif KERNEL_SIZE == 3
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr));                                             \
+        int8 src0            = convert_int8(vload8(0, src_row_ptr));                                                 \
+        int2 src1            = convert_int2(vload2(0, src_row_ptr + 8));                                             \
+        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset);                                   \
+        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
+        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+    })
+
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                 \
+    ({                                                                                                            \
+        int3  weights_values0 = convert_int3(vload3(0, weights_row_ptr));                                         \
+        int16 src0            = convert_int16(vload16(0, src_row_ptr));                                           \
+        int   src1            = convert_int(*(src_row_ptr + 16));                                                 \
+        acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset);                           \
+        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset);      \
+        acc += ((int8)(src0.s2468, src0.sACE, src1) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+    })
+
+#elif KERNEL_SIZE == 1
+
+#if STRIDE_X == 3
+#define INPUT_PIXEL extract_input_stride3
+#elif STRIDE_X == 2
+#define INPUT_PIXEL extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_PIXEL extract_input_stride1
+
+#else /* STRIDE_X not equals 1, 2 or 3 */
+#error "Only support strides 1, 2 and 3"
+#endif /* STRIDE_X */
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline uchar8 extract_input_stride1(__global const uchar *input_pixel)
+{
+    return vload8(0, input_pixel);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline uchar8 extract_input_stride2(__global const uchar *input_pixel)
+{
+    uchar16 temp = vload16(0, input_pixel);
+    return temp.s02468ace;
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
+{
+    uchar16 temp1 = vload16(0, input_pixel);
+    uchar16 temp2 = vload16(0, input_pixel + 12);
+    return (uchar8)(temp1.s0369, temp2.s0369);
+}
+
+#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */
+#error "Only kernel sizes 1, 3, 5 and 9 are supported"
+#endif /* KERNEL_SIZE */
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: S32
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  input_offset                          Input offset quantization parameter
+ * @param[in]  weight_offset                         Weights offset quantization parameter
+ * @param[in]  output_offset                         Output offset quantization parameter
+ */
+__kernel void direct_convolution_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w,
+    int          input_offset,
+    int          weight_offset,
+    int          output_offset)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int8 pixels0 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+#if KERNEL_SIZE == 9
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 0 * src_stride_y), (__global uchar *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 3 * src_stride_y), (__global uchar *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 4 * src_stride_y), (__global uchar *)(weights_addr + 4 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 5 * src_stride_y), (__global uchar *)(weights_addr + 5 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 6 * src_stride_y), (__global uchar *)(weights_addr + 6 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 7 * src_stride_y), (__global uchar *)(weights_addr + 7 * weights_stride_y));
+        CONVOLUTION1x9(pixels0, (__global uchar *)(src_addr + 8 * src_stride_y), (__global uchar *)(weights_addr + 8 * weights_stride_y));
+#elif KERNEL_SIZE == 5
+        CONVOLUTION1x5(pixels0, (__global uchar *)src_addr, (__global uchar *)weights_addr);
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 3 * src_stride_y), (__global uchar *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 4 * src_stride_y), (__global uchar *)(weights_addr + 4 * weights_stride_y));
+#elif KERNEL_SIZE == 3
+        CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 0 * src_stride_y), (__global uchar *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
+#elif KERNEL_SIZE == 1
+        int weight       = convert_int(*(__global uchar *)weights_addr);
+        int8 input_pixel = convert_int8(INPUT_PIXEL((__global uchar *)src_addr));
+        pixels0 += (input_pixel + input_offset) * ((int8)weight + weight_offset);
+#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector        biases    = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
+    pixels0 += (int8)(*bias_addr);
+#endif /* defined(HAS_BIAS) */
+
+#if OUTPUT_SHIFT < 0
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_SHIFT < 0
+    pixels0 = pixels0 + output_offset;
+
+    vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_operation.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_operation.clembed
new file mode 100644
index 0000000..6a11ccc
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_operation.clembed

@@ -0,0 +1,643 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** List of all the operations supported by this kernel.
+ * @note ADD and SUB operations, when executed on integers, support saturation */
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+#define MAX(x, y) max(x, y)
+#define MIN(x, y) min(x, y)
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define DIV(x, y) (x / y)
+#define POWER(x, y) pow(x, y)
+#define PRELU(x, y) (select(y * x, x, x > (DATA_TYPE_OUT)0))
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (OP(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_operation_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_operation_quantized.clembed
new file mode 100644
index 0000000..c190ab5
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_operation_quantized.clembed

@@ -0,0 +1,653 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define SUB(x, y) (x - y)
+#define ADD(x, y) (x + y)
+#define MAX(x, y) max((x), (y))
+#define MIN(x, y) min((x), (y))
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define PRELU(x, y) (select(y * x, x, x > (DATA_TYPE_OUT)0))
+#define DIV(x, y) (x / y)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @attention The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @attention The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @attention The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @attention The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @attention The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @attention The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ * @attention For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero
+ * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT);
+
+    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+    const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+    const VEC_TYPE  res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_unary.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_unary.clembed
new file mode 100644
index 0000000..6f2cbd9
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/elementwise_unary.clembed

@@ -0,0 +1,1311 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size(coords, width, height, 1);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const float16 t = (float16)(
+                          /* tl, tr, bl, br */
+                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+
+// Calculate exponential
+#define exp_op(input) exp(input)
+// Calculate reverse square root
+#define rsqrt_op(input) rsqrt(input)
+// Calculate negative
+#define neg_op(input) (-input)
+// Calculate sine
+#define sin_op(input) sin(input)
+// Calculate abs for floating point values
+#define fabs_op(input) fabs(input)
+// Calculate natural_log
+#define natural_log_op(input) log(input)
+// Calculate round (Cannot use round function as it rounds halfway cases away from zero).
+#if defined(VEC_SIZE)
+#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+#else // defined(VEC_SIZE
+#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE)
+#endif // defined(VEC_SIZE
+
+/** Applies element wise unary operator in a tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: F16/32.
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: F16/32.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_step_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+    out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    VSTORE(VEC_SIZE)
+    (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/erode.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/erode.clembed
new file mode 100644
index 0000000..b11c156
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/erode.clembed

@@ -0,0 +1,599 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function erodes an input image image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void erode(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uchar16 top    = vload16(0, offset(&src, -1, -1));
+    uchar16 middle = vload16(0, offset(&src, -1, 0));
+    uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+    uchar16 tmp = min(top, min(middle, bottom));
+    uchar8  out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
+
+    vstore8(out, 0, dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/fast_corners.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/fast_corners.clembed
new file mode 100644
index 0000000..d125cd4
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/fast_corners.clembed

@@ -0,0 +1,860 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+    int x; /**< The x coordinate. */
+    int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/* Keypoint struct */
+typedef struct Keypoint
+{
+    int   x;               /**< The x coordinate. */
+    int   y;               /**< The y coordinate. */
+    float strength;        /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+    float scale;           /**< Initialized to 0 by corner detectors. */
+    float orientation;     /**< Initialized to 0 by corner detectors. */
+    int   tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+    ushort x;         /**< Top-left x coordinate */
+    ushort y;         /**< Top-left y coordinate */
+    ushort width;     /**< Width of the detection window */
+    ushort height;    /**< Height of the detection window */
+    ushort idx_class; /**< Index of the class */
+    float  score;     /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
+
+/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
+ *
+ *      . . F 0 1 . . .
+ *      . E . . . 2 . .
+ *      D . . . . . 3 .
+ *      C . . P . . 4 .
+ *      B . . . . . 5 .
+ *      . A . . . 6 . .
+ *      . . 9 8 7 . . .
+ */
+constant int offsets_s[16][2] =
+{
+    { 0, -3 },  // 0
+    { 1, -3 },  // 1
+    { 2, -2 },  // 2
+    { 3, -1 },  // 3
+    { 3, 0 },   // 4
+    { 3, 1 },   // 5
+    { 2, 2 },   // 6
+    { 1, 3 },   // 7
+    { 0, 3 },   // 8
+    { -1, 3 },  // 9
+    { -2, 2 },  // A
+    { -3, 1 },  // B
+    { -3, 0 },  // C
+    { -3, -1 }, // D
+    { -2, -2 }, // E
+    { -1, -3 }, // F
+};
+
+/** Load a pixel and set the mask values.
+ *
+ * @param[in]  ptr         The pointer to the starting address of source image
+ * @param[in]  a           Index to indicate the position in the Bresenham circle
+ * @param[in]  stride      Stride of source image in x dimension
+ * @param[in]  dark        The left end of the threshold range
+ * @param[in]  bright      The right end of the threshold range
+ * @param[out] dark_mask   The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark
+ * @param[out] bright_mask The bit-set mask records bright pixels. Its bit is set as 1 if the corresponding pixel is bright
+ *
+ */
+#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
+    {                                                                           \
+        unsigned char pixel;                                                    \
+        pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]);       \
+        dark_mask |= (pixel < dark) << a;                                       \
+        bright_mask |= (pixel > bright) << a;                                   \
+    }
+
+/** Checks if a pixel is a corner. Pixel is considerred as a corner if the 9 continuous pixels in the Bresenham circle are bright or dark.
+ *
+ * @param[in]  bright_mask The mask recording postions of bright pixels
+ * @param[in]  dark_mask   The mask recording postions of dark pixels
+ * @param[out] isCorner    Indicate whether candidate pixel is corner
+ */
+#define CHECK_CORNER(bright_mask, dark_mask, isCorner)    \
+    {                                                     \
+        for(int i = 0; i < 16; i++)                       \
+        {                                                 \
+            isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
+            isCorner |= ((dark_mask & 0x1FF) == 0x1FF);   \
+            if(isCorner)                                  \
+            {                                             \
+                break;                                    \
+            }                                             \
+            bright_mask >>= 1;                            \
+            dark_mask >>= 1;                              \
+        }                                                 \
+    }
+
+/* Calculate pixel's strength */
+uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
+{
+    short a = threshold;
+    short b = 255;
+    while(b - a > 1)
+    {
+        uchar        c           = convert_uchar_sat((a + b) / 2);
+        unsigned int bright_mask = 0;
+        unsigned int dark_mask   = 0;
+
+        unsigned char p_bright = add_sat(candidate_pixel, c);
+        unsigned char p_dark   = sub_sat(candidate_pixel, c);
+
+        bool isCorner = 0;
+
+        for(uint i = 0; i < 16; i++)
+        {
+            LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
+        }
+
+        bright_mask |= (bright_mask << 16);
+        dark_mask |= (dark_mask << 16);
+        CHECK_CORNER(bright_mask, dark_mask, isCorner);
+
+        if(isCorner)
+        {
+            a = convert_short(c);
+        }
+        else
+        {
+            b = convert_short(c);
+        }
+    }
+    return a;
+}
+
+/** Fast corners implementation. Calculates and returns the strength of each pixel.
+ *
+ * The algorithm loops through the 16 pixels in the Bresenham circle and set low 16 bit of masks if corresponding pixel is bright
+ * or dark. It then copy the low 16 bit to the high 16 bit of the masks. Right shift the bit to check whether the 9 continuous bits
+ * from the LSB are set.
+ *
+ * @param[in]  input_ptr                            Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                       Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[out] output_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  output_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  threshold_value                      Threshold value.
+ *
+ */
+__kernel void fast_corners(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output),
+    float threshold_value)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    const unsigned char threshold = (uchar)threshold_value;
+
+    unsigned int bright_mask = 0;
+    unsigned int dark_mask   = 0;
+
+    unsigned char isCorner = 0;
+
+    unsigned char p        = *in.ptr;
+    unsigned char p_bright = add_sat(p, threshold);
+    unsigned char p_dark   = sub_sat(p, threshold);
+
+    LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+    if(((bright_mask | dark_mask) & 0x1111) == 0)
+    {
+        *out.ptr = 0;
+        return;
+    }
+
+    LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+    LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+    bright_mask |= (bright_mask << 16);
+    dark_mask |= (dark_mask << 16);
+
+    CHECK_CORNER(bright_mask, dark_mask, isCorner)
+
+    if(!isCorner)
+    {
+        *out.ptr = 0;
+        return;
+    }
+
+#ifdef USE_MAXSUPPRESSION
+    *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
+#else  /* USE_MAXSUPPRESSION */
+    *out.ptr = 1;
+#endif /* USE_MAXSUPPRESSION */
+}
+
+/** Copy result to Keypoint buffer and count number of corners
+ *
+ * @param[in]  input_ptr                           Pointer to the image with calculated strenghs. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  max_num_points                      The maximum number of keypoints the array can hold
+ * @param[out] offset                              The number of skipped pixels in x dimension
+ * @param[out] num_of_points                       Number of points found
+ * @param[out] out                                 The keypoints found
+ *
+ */
+__kernel void copy_to_keypoint(
+    IMAGE_DECLARATION(input),
+    uint     max_num_points,
+    uint     offset,
+    __global uint *num_of_points,
+    __global Keypoint *out)
+{
+#ifndef UPDATE_NUMBER
+    if(*num_of_points >= max_num_points)
+    {
+        return;
+    }
+#endif /* UPDATE_NUMBER */
+
+    Image in = CONVERT_TO_IMAGE_STRUCT(input);
+
+    uchar value = *in.ptr;
+
+    if(value > 0)
+    {
+        int id = atomic_inc(num_of_points);
+        if(id < max_num_points)
+        {
+            out[id].strength        = value;
+            out[id].x               = get_global_id(0) + offset;
+            out[id].y               = get_global_id(1) + offset;
+            out[id].tracking_status = 1;
+            out[id].scale           = 0.f;
+            out[id].orientation     = 0.f;
+            out[id].error           = 0.f;
+        }
+    }
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/fft.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/fft.clembed
new file mode 100644
index 0000000..1f415c9
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/fft.clembed

@@ -0,0 +1,2314 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Calculates and applies the twiddle factor to a given input.
+ *
+ * @param[in]     phi   The angle.
+ * @param[in,out] input The input on which the factor should be applied.
+ */
+#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input)  \
+    {                                              \
+        float2 w, tmp;                             \
+        w.x   = native_cos(phi);                   \
+        w.y   = native_sin(phi);                   \
+        tmp.x = (w.x * input.x) - (w.y * input.y); \
+        tmp.y = (w.x * input.y) + (w.y * input.x); \
+        input = tmp;                               \
+    }
+
+/** Computes radix-2 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ */
+#define DFT_2(c0, c1) \
+    {                 \
+        float2 v0;    \
+        v0 = c0;      \
+        c0 = v0 + c1; \
+        c1 = v0 - c1; \
+    }
+
+// radix-3 butterfly unit factors
+#define SQRT3DIV2 0.86602540378443f
+
+/** Computes radix-3 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ */
+#define DFT_3(c0, c1, c2)                                  \
+    {                                                      \
+        float2 v0 = c1 + c2;                               \
+        float2 v1 = c1 - c2;                               \
+        c1.x      = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
+        c1.y      = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
+        c2.x      = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
+        c2.y      = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
+        c0        = c0 + v0;                               \
+    }
+
+/**Computes radix-4 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ */
+#define DFT_4(c0, c1, c2, c3)  \
+    {                          \
+        float2 v0, v1, v2, v3; \
+        v0   = c0 + c2;        \
+        v1   = c1 + c3;        \
+        v2   = c0 - c2;        \
+        v3.x = c1.y - c3.y;    \
+        v3.y = c3.x - c1.x;    \
+        c0   = v0 + v1;        \
+        c2   = v0 - v1;        \
+        c1   = v2 + v3;        \
+        c3   = v2 - v3;        \
+    }
+
+// radix-5 butterfly unit factors
+#define W5_A 0.30901699437494f
+#define W5_B 0.95105651629515f
+#define W5_C 0.80901699437494f
+#define W5_D 0.58778525229247f
+
+/** Computes radix-5 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ */
+#define DFT_5(c0, c1, c2, c3, c4)                 \
+    {                                             \
+        float2 v0, v1, v2, v3, v4;                \
+        v0 = c0;                                  \
+        v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
+        v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
+        v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
+        v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
+        c0 = v0 + c1 + c2 + c3 + c4;              \
+        c1 = v0 + v1 + (float2)(v4.y, -v4.x);     \
+        c2 = v0 - v2 + (float2)(v3.y, -v3.x);     \
+        c3 = v0 - v2 + (float2)(-v3.y, v3.x);     \
+        c4 = v0 + v1 + (float2)(-v4.y, v4.x);     \
+    }
+
+// radix-7 butterfly unit factors
+#define W7_A 0.62348980185873f
+#define W7_B 0.78183148246802f
+#define W7_C 0.22252093395631f
+#define W7_D 0.97492791218182f
+#define W7_E 0.90096886790241f
+#define W7_F 0.43388373911755f
+
+/** Computes radix-7 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ */
+#define DFT_7(c0, c1, c2, c3, c4, c5, c6)                            \
+    {                                                                \
+        float2 v0, v1, v2, v3, v4, v5, v6;                           \
+        v0 = c0;                                                     \
+        v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
+        v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
+        v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
+        v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
+        v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
+        v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
+        c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6;                       \
+        c1 = v0 + v1 + (float2)(v4.y, -v4.x);                        \
+        c2 = v0 - v2 + (float2)(v5.y, -v5.x);                        \
+        c3 = v0 - v3 + (float2)(v6.y, -v6.x);                        \
+        c4 = v0 - v3 + (float2)(-v6.y, v6.x);                        \
+        c5 = v0 - v2 + (float2)(-v5.y, v5.x);                        \
+        c6 = v0 + v1 + (float2)(-v4.y, v4.x);                        \
+    }
+
+/** Computes radix-8 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ * @param[in,out] c7 Complex input 7.
+ */
+#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7)  \
+    {                                          \
+        float2 v0, v1, v2, v3, v4, v5, v6, v7; \
+        float2 s0, s1, s2, s3, s4, s5, s6, s7; \
+        float2 t0, t1, t2;                     \
+        v0   = c0 + c4;                        \
+        v1   = c1 + c5;                        \
+        v2   = c2 + c6;                        \
+        v3   = c3 + c7;                        \
+        v4   = c0 - c4;                        \
+        v5   = c1 - c5;                        \
+        v6   = c2 - c6;                        \
+        v7   = c3 - c7;                        \
+        s0   = v0 + v2;                        \
+        s1   = v1 + v3;                        \
+        s2   = v0 - v2;                        \
+        s3   = v1 - v3;                        \
+        s4.x = v4.x - v6.y;                    \
+        s4.y = v4.y + v6.x;                    \
+        s5.x = v5.x - v7.y;                    \
+        s5.y = v5.y + v7.x;                    \
+        s6.x = v4.x + v6.y;                    \
+        s6.y = v4.y - v6.x;                    \
+        s7.x = v5.x + v7.y;                    \
+        s7.y = v5.y - v7.x;                    \
+        t0.x = -s3.y;                          \
+        t0.y = s3.x;                           \
+        t1.x = M_SQRT1_2_F * (s5.x - s5.y);    \
+        t1.y = M_SQRT1_2_F * (s5.x + s5.y);    \
+        t2.x = -M_SQRT1_2_F * (s7.x + s7.y);   \
+        t2.y = M_SQRT1_2_F * (s7.x - s7.y);    \
+        c0   = s0 + s1;                        \
+        c1   = s6 - t2;                        \
+        c2   = s2 - t0;                        \
+        c3   = s4 - t1;                        \
+        c4   = s0 - s1;                        \
+        c5   = s6 + t2;                        \
+        c6   = s2 + t0;                        \
+        c7   = s4 + t1;                        \
+    }
+
+/** Computes the first stage of a radix-2 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float4 data = vload4(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 2
+    DFT_2(data.s01, data.s23);
+
+    // Store two complex output values
+    vstore4(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-2 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float2 data1 = vload2(0, (__global float *)input.ptr);
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Compute DFT N = 2
+    DFT_2(data1, data2);
+
+    // Store two complex output values
+    vstore2(data1, 0, (__global float *)output.ptr);
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float4 data0 = vload4(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0.s01, data0.s23, data1.s01);
+
+    // Store three complex output values
+    vstore4(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0, data1, data2);
+
+    // Store three complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float8 data = vload8(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 4
+    DFT_4(data.s01, data.s23, data.s45, data.s67);
+
+    // Store four complex output values
+    vstore8(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+
+    // Compute DFT N = 4
+    DFT_4(data0, data1, data2, data3);
+
+    // Store four complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float8 data0 = vload8(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
+
+    // Store five complex output values
+    vstore8(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+    float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0, data1, data2, data3, data4);
+
+    // Store five complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float8 data0 = vload8(0, (__global float *)input.ptr);
+    float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
+
+    // Store seven complex output values
+    vstore8(data0, 0, (__global float *)output.ptr);
+    vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+    float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+    float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
+    float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0, data1, data2, data3, data4, data5, data6);
+
+    // Store seven complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+    vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
+    vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_8_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float16 data = vload16(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 8
+    DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
+
+    // Store eight complex output values
+    vstore16(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_8_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+    float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+    float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
+    float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+    float2 data7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7, 0));
+
+    // Compute DFT N = 8
+    DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
+
+    // Store eight complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+    vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
+    vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+    vstore2(data7, 0, (__global float *)tensor3D_offset(&output, 0, 7, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_2_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_2_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_3_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_3_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_4_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_4_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_5_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_5_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_7_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_7_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_8_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+    float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+    vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_8_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+    float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+    vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/fft_digit_reverse.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/fft_digit_reverse.clembed
new file mode 100644
index 0000000..3355643
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/fft_digit_reverse.clembed

@@ -0,0 +1,691 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(VEC_SIZE)
+/** Computes the digit reverse stage on axis X
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_ptr                           Pointer to the index tensor. Supported data types: U32
+ * @param[in]  idx_stride_x                      Stride of the index tensor in X dimension (in bytes)
+ * @param[in]  idx_step_x                        idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_0(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   idx = CONVERT_TO_VECTOR_STRUCT(idx);
+
+    const unsigned int iidx = *((__global uint *)(idx.ptr));
+
+    // Load data
+#if VEC_SIZE == 1
+    float data = *((__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#elif VEC_SIZE == 2
+    float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    float2 res = { data, 0 };
+#elif VEC_SIZE == 2
+    float2 res  = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(res, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+}
+
+/** Computes the digit reverse stage on axis Y
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_ptr                           Pointer to the index tensor. Supported data types: U32
+ * @param[in]  idx_stride_x                      Stride of the index tensor in X dimension (in bytes)
+ * @param[in]  idx_step_x                        idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
+
+    const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
+
+    // Load data
+#if VEC_SIZE == 1
+    float data = *((__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#elif VEC_SIZE == 2
+    float2 data = vload2(0, (__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    float2 res = { data, 0 };
+#elif VEC_SIZE == 2
+    float2 res  = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(res, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+}
+#endif // defined(VEC_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/fft_scale.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/fft_scale.clembed
new file mode 100644
index 0000000..768a2e7
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/fft_scale.clembed

@@ -0,0 +1,621 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Computes the fft scale stage
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in]  scale                             Scale to apply to the complex value
+ */
+__kernel void fft_scale_conj(
+    TENSOR3D_DECLARATION(src)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(dst)
+#endif /* not IN_PLACE */
+    ,
+    float scale)
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#if defined(IN_PLACE)
+    Tensor3D dst = src;
+#else  /* IN_PLACE */
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#endif /* IN_PLACE */
+
+    // Store result
+#if VEC_SIZE == 1
+    *((__global float *)dst.ptr) = (*(__global float *)src.ptr) / scale;
+#elif VEC_SIZE == 2
+    // Load data
+    float2 data = vload2(0, (__global float *)src.ptr);
+    data /= scale;
+#if defined(CONJ)
+    vstore2((float2)(data.s0, -data.s1), 0, (__global float *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(data, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+#else  // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/fill_border.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/fill_border.clembed
new file mode 100644
index 0000000..f0081cc
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/fill_border.clembed

@@ -0,0 +1,708 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention  The DATA_TYPE needs to be passed at the compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]     buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]     buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]     width                             Width of the valid region of the image
+ * @param[in]     height                            Height of the valid region of the image
+ * @param[in]     start_pos                         XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+    TENSOR3D_DECLARATION(buf),
+    uint width,
+    uint height,
+    int2 start_pos)
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0        = get_global_id(0);
+    const int gidH        = gid0 - total_width;
+    const int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+        }
+        // Handle right border
+        DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || gidW > (width - 1))
+        {
+            val_idx = gidW < 0 ? 0 : width - 1;
+        }
+
+        // Handle top border
+        DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+        }
+        // Handle bottom border
+        DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+        }
+    }
+}
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @attention  The DATA_TYPE needs to be passed at the compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]  buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  width                             Width of the valid region of the image
+ * @param[in]  height                            Height of the valid region of the image
+ * @param[in]  start_pos                         XY coordinate indicating the start point of the valid region
+ * @param[in]  constant_value                    Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+    TENSOR3D_DECLARATION(buf),
+    uint      width,
+    uint      height,
+    int2      start_pos,
+    DATA_TYPE constant_value)
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0        = get_global_id(0);
+    const int gidH        = gid0 - total_width;
+    const int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+        }
+    }
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/flatten.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/flatten.clembed
new file mode 100644
index 0000000..8baeac6
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/flatten.clembed

@@ -0,0 +1,619 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH)
+
+/** This opencl kernel flattens the first 3 dimensions of the input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The width, height and depth of the input tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24, -DSRC_DEPTH=16
+ * @note If the output has 3 dimensions, the 2nd dimension of the output tensor must be passed at compile time using -DDST_DIM1. e.g -DDST_DIM1=3
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void flatten(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+
+    uint c  = get_global_id(2) % SRC_DEPTH; // input feature map
+    uint b0 = get_global_id(2) / SRC_DEPTH; // batch id
+    uint b1 = 0;
+
+#if defined(DST_DIM1)
+    uint b_tmp = b0;
+    b0 = b_tmp % DST_DIM1; // batch id0
+    b1 = b_tmp / DST_DIM1; // batch id1
+#endif // defined(DST_DIM1)
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+                                 (get_global_id(0) + get_global_id(1) * (uint)SRC_WIDTH + c * (uint)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(DATA_TYPE) +
+                                 b0 * dst_stride_y +
+                                 b1 * dst_stride_z;
+
+    *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/floor.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/floor.clembed
new file mode 100644
index 0000000..13e485b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/floor.clembed

@@ -0,0 +1,601 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Perform a floor operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void floor_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VSTORE(VEC_SIZE)
+    (floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/gather.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/gather.clembed
new file mode 100644
index 0000000..7854361
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/gather.clembed

@@ -0,0 +1,634 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(AXIS)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source tensor
+ * @param[in]  indices_ptr                           Pointer to the indices vector. Supported data types: S32/U32.
+ * @param[in]  indices_stride_x                      Stride of the indices vector in X dimension (in bytes)
+ * @param[in]  indices_step_x                        input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination tensor
+ */
+__kernel void gather(
+    TENSOR4D_DECLARATION(input),
+    VECTOR_DECLARATION(indices),
+    TENSOR4D_DECLARATION(output))
+{
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+    const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+    const Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+    const Vector   indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+    Tensor4D       output  = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+    const uint index                 = *(__global const uint *)vector_offset(&indices, px);
+    __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif AXIS == 1
+    const uint index                 = *(__global const uint *)vector_offset(&indices, py);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif AXIS == 2
+    const uint index                 = *(__global const uint *)vector_offset(&indices, pz);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif AXIS == 3
+    const uint index                 = *(__global const uint *)vector_offset(&indices, pw);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif //AXIS
+
+    *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif //defined(DATA_TYPE) && defined(AXIS)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/gaussian_pyramid.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/gaussian_pyramid.clembed
new file mode 100644
index 0000000..fb00445
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/gaussian_pyramid.clembed

@@ -0,0 +1,656 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian1x5_sub_x(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values for the convolution (20 bytes needed)
+    uchar16 temp0 = vload16(0, src.ptr);
+    uchar4  temp1 = vload4(0, src.ptr + 16);
+
+    // Convert to USHORT8
+    ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
+    ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
+    ushort8 m_data  = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
+    ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
+    ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
+
+    // Compute convolution along the X direction
+    ushort8 pixels = l2_data + r2_data;
+    pixels += l1_data * (ushort8)4;
+    pixels += m_data * (ushort8)6;
+    pixels += r1_data * (ushort8)4;
+
+    // Store result
+    vstore8(pixels, 0, (__global ushort *)dst.ptr);
+}
+
+/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian5x1_sub_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
+    ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
+    ushort8 m_data  = vload8(0, (__global ushort *)offset(&src, 0, 2));
+    ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
+    ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
+
+    // Compute convolution along the Y direction
+    ushort8 pixels = u2_data + d2_data;
+    pixels += u1_data * (ushort8)4;
+    pixels += m_data * (ushort8)6;
+    pixels += d1_data * (ushort8)4;
+
+    // Scale result
+    pixels >>= (ushort8)8;
+
+    // Store result
+    vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/gemm.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/gemm.clembed
new file mode 100644
index 0000000..fc8f9f2
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/gemm.clembed

@@ -0,0 +1,8605 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ROW_n
+ *
+ * @param[in] N0        The number of rows to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME  The basename of the destination variables for the loaded rows
+ * @param[in] PTR       The base pointer
+ * @param[in] OFFSET    The offset within a row
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+/** @}*/ // end of group LOAD_ROW_n
+
+/** Load Blocks (consecutive rows and columns) with Z offset.
+ * @name LOAD_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of consecutive rows
+ * @param[in] N0        The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME  The basename of the result variables
+ * @param[in] PTR       The base pointer for the data
+ * @param[in] OFFSET    The offset within a row
+ * @param[in] STRIDE_Y  The stride in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_BLOCK
+
+/** Basic macros to calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET_n
+ *
+ * @param[in] M0              The number of offset values to calculate
+ * @param[in] DATA_TYPE       The data type of the results
+ * @param[in] Z               The basename of the result variables
+ * @param[in] Y               The work-itme ID of y-axis
+ * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
+ * @param[in] STRIDE_Y        The stride value in y-axis direction
+ *
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
+    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
+    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
+    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
+    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
+    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
+    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
+    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
+    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+/** @} */ // end of group CALCULATE_Z_OFFSET_n
+
+/** Calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET
+ *
+ * The Z offsets are expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3.
+ * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account
+ * the possible cross plane paddings in case of the plance changes across the z-dimension.
+ *
+ * <!--
+ * |                  |
+ * |      plane0      |
+ * |                  |
+ * |__________________|
+ * |******************|
+ * |  cross_plane_pad |
+ * |******************|
+ * |                  |
+ * |      plane1      |
+ * |                  |
+ * |__________________|
+ * -->
+ *
+ * @param[in] M0              The number of offset values to calculate
+ * @param[in] DATA_TYPE       The data type of the results
+ * @param[in] Z               The basename of the result variables
+ * @param[in] Y               The work-itme ID of y-axis
+ * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
+ * @param[in] STRIDE_Y        The stride value in y-axis direction
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+/** @} */ // end of group CALCULATE_Z_OFFSET
+
+/** Store the 0 to (n-1)th rows of the given variables
+ * @name STORE_ROW_n
+ *
+ * @param[in] N0        The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                 \
+    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
+    VSTORE(N0)                                                  \
+    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of groupd STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0        The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                     \
+    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of groupd CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of rows to store
+ * @param[in] N0        The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of rows to store
+ * @param[in] N0        The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
+
+/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
+ * @name SCALE_ROW_n
+ *
+ * @param[in] DATA_TYPE The data type of the variables
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] SCALE     The scale factor
+ * @{
+ */
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+    BASENAME##0 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##1 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##2 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##3 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##4 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##5 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##6 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##7 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##8 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
+    BASENAME##9 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##A *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##B *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##C *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##D *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##E *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##F *= (DATA_TYPE)SCALE;
+/** @} */ // end of group SCALE_ROW_n
+
+/** Scale elements stored in a block (BASENAME)
+ * @name SCALE_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N         The number of rows in the block
+ * @param[in] DATA_TYPE The data type of the block
+ * @param[in] BASENAME  The basename of the block
+ * @param[in] SCALE     The scale factor
+ * @{
+ */
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+/** @} */ // end of group SCALE_BLOCK
+
+/** Create a new vector containing the values at the given index for a set of given vectors
+ * @name COLUMN_VECTORn
+ *
+ * @param[in] IDX_COL  The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X        The basename of the source vectors
+ * @param[in] TYPE     The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
+    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 2)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
+#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 3)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
+#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 4)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 8)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 16)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+/** @} */ // end of group COLUMN_VECTORn
+
+/** Create transposed vectors of the given vectors
+ * @name TRANSPOSE_K0Xn
+ *
+ * @param[in] K0       The size of the source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B        The basename of source vectors for transposition
+ * @param[in] TYPE     The data type of the transposed vectors
+ * @{
+ */
+#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) \
+    COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X1(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X2(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X3(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X4(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X8(K0, BASENAME, B, TYPE);     \
+    COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, A, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, B, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, C, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, D, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, E, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+
+/** @} */ // end of group TRANSPOSE_K0Xn
+
+/** Create column vectors to contain the values at the given index for a set of given vectors
+ *
+ * @param[in] K0       The number of source vectors
+ * @param[in] IDX_COL  The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B        The basename of the source vectors
+ * @param[in] TYPE     The data type of the destination vectors
+ */
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
+    CONCAT(COLUMN_VECTOR, K0)                         \
+    (IDX_COL, BASENAME, B, TYPE);
+
+/** Create transposed vectors form the given source vectors
+ *
+ * @param[in] K0       The size of source vectors
+ * @param[in] N0       The number of source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B        The basename of source vectors for transposition
+ * @param[in] TYPE     The data type of the transposed vectors
+ *
+ */
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
+    CONCAT(TRANSPOSE_K0X, N0)                      \
+    (K0, BASENAME, B, TYPE);
+
+/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
+ * @name ADD_ROW_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The basename of the added variables
+ * @{
+ */
+#define ADD_ROW_1(BASENAME, BIAS) \
+    BASENAME##0 += BIAS##0;
+
+#define ADD_ROW_2(BASENAME, BIAS) \
+    ADD_ROW_1(BASENAME, BIAS)     \
+    BASENAME##1 += BIAS##1;
+
+#define ADD_ROW_3(BASENAME, BIAS) \
+    ADD_ROW_2(BASENAME, BIAS)     \
+    BASENAME##2 += BIAS##2;
+
+#define ADD_ROW_4(BASENAME, BIAS) \
+    ADD_ROW_3(BASENAME, BIAS)     \
+    BASENAME##3 += BIAS##3;
+
+#define ADD_ROW_5(BASENAME, BIAS) \
+    ADD_ROW_4(BASENAME, BIAS)     \
+    BASENAME##4 += BIAS##4;
+
+#define ADD_ROW_6(BASENAME, BIAS) \
+    ADD_ROW_5(BASENAME, BIAS)     \
+    BASENAME##5 += BIAS##5;
+
+#define ADD_ROW_7(BASENAME, BIAS) \
+    ADD_ROW_6(BASENAME, BIAS)     \
+    BASENAME##6 += BIAS##6;
+
+#define ADD_ROW_8(BASENAME, BIAS) \
+    ADD_ROW_7(BASENAME, BIAS)     \
+    BASENAME##7 += BIAS##7;
+
+#define ADD_ROW_9(BASENAME, BIAS) \
+    ADD_ROW_8(BASENAME, BIAS)     \
+    BASENAME##8 += BIAS##8;
+
+#define ADD_ROW_10(BASENAME, BIAS) \
+    ADD_ROW_9(BASENAME, BIAS)      \
+    BASENAME##9 += BIAS##9;
+
+#define ADD_ROW_11(BASENAME, BIAS) \
+    ADD_ROW_10(BASENAME, BIAS)     \
+    BASENAME##A += BIAS##A;
+
+#define ADD_ROW_12(BASENAME, BIAS) \
+    ADD_ROW_11(BASENAME, BIAS)     \
+    BASENAME##B += BIAS##B;
+
+#define ADD_ROW_13(BASENAME, BIAS) \
+    ADD_ROW_12(BASENAME, BIAS)     \
+    BASENAME##C += BIAS##C;
+
+#define ADD_ROW_14(BASENAME, BIAS) \
+    ADD_ROW_13(BASENAME, BIAS)     \
+    BASENAME##D += BIAS##D;
+
+#define ADD_ROW_15(BASENAME, BIAS) \
+    ADD_ROW_14(BASENAME, BIAS)     \
+    BASENAME##E += BIAS##E;
+
+#define ADD_ROW_16(BASENAME, BIAS) \
+    ADD_ROW_15(BASENAME, BIAS)     \
+    BASENAME##F += BIAS##F;
+
+/** @} */ // end of group ADD_ROW_n
+
+/** Add the block (BIAS) to another block (BASENAME)
+ * @name ADD_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N        The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The basename of the added variables
+ * @{
+ */
+#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK
+
+/** Broadcast (add single value) to the each element of the destination variables
+ * @name ADD_ROW_BROADCAST_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The variable containing the value to add
+ * @{
+ */
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
+    BASENAME##0 += BIAS;
+
+#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
+    BASENAME##1 += BIAS;
+
+#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
+    BASENAME##2 += BIAS;
+
+#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
+    BASENAME##3 += BIAS;
+
+#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
+    BASENAME##4 += BIAS;
+
+#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
+    BASENAME##5 += BIAS;
+
+#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
+    BASENAME##6 += BIAS;
+
+#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
+    BASENAME##7 += BIAS;
+
+#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
+    BASENAME##8 += BIAS;
+
+#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
+    BASENAME##9 += BIAS;
+
+#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
+    BASENAME##A += BIAS;
+
+#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
+    BASENAME##B += BIAS;
+
+#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
+    BASENAME##C += BIAS;
+
+#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
+    BASENAME##D += BIAS;
+
+#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
+    BASENAME##E += BIAS;
+
+#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
+    BASENAME##F += BIAS;
+
+/** Broadcast (add a value) to the each element of the destination block (BASENAME)
+ * @name ADD_BLOCK_BROADCAST
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N        The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The variable containing the value to add
+ * @{
+ */
+#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK_BROADCAST
+
+/** Apply activation to the given variables
+ * @name ACTIVATION_ROW_n
+ *
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE       The data type of the vectors
+ * @param[in] BASENAME        The basename of the variables
+ * @param[in] A_VAL           Additional value required by the activation
+ * @param[in] B_VAL           Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)      \
+    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+/** @} */ // end of group ACTIVATION_ROW_n
+
+/** Apply activation to a block (BASENAME)
+ * @name ACTIVATION_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N               The number of vectors in the block
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE       The data type of the vectors
+ * @param[in] BASENAME        The basename of the variables
+ * @param[in] A_VAL           Additional value required by the activation
+ * @param[in] B_VAL           Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+/** @} */ // end of group ACTIVATION_BLOCK
+
+/** Apply convert_<data_type> to the given variables
+ * @name CONVERT_ROW_n
+ *
+ * @param[in] N            The size of the vectors
+ * @param[in] DATA_TYPE    The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ */
+#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
+/** @} */ // end of group CONVERT_ROW_n
+
+/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
+ * @name CONVERT_BLOCK
+ *
+ * Supported cases N=1,2,3,...,16.
+ *
+ * @param[in] M            The number of vectors to convert
+ * @param[in] N            The size of the vectors
+ * @param[in] DATA_TYPE    The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ */
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+/** Macros that help in loop unrolling */
+//Repeat macros with 3 param, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(1, P_A, P_B, P_C);       \
+    REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(2, P_A, P_B, P_C);       \
+    REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(3, P_A, P_B, P_C);       \
+    REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(4, P_A, P_B, P_C);       \
+    REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(5, P_A, P_B, P_C);       \
+    REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(6, P_A, P_B, P_C);       \
+    REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(7, P_A, P_B, P_C);       \
+    REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(8, P_A, P_B, P_C);       \
+    REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(9, P_A, P_B, P_C);        \
+    REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(A, P_A, P_B, P_C);        \
+    REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(B, P_A, P_B, P_C);        \
+    REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(C, P_A, P_B, P_C);        \
+    REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(D, P_A, P_B, P_C);        \
+    REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(E, P_A, P_B, P_C);        \
+    REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(F, P_A, P_B, P_C);        \
+    REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+//Macro for initializing N variables. generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
+
+#endif // ARM_COMPUTE_REPEAT_H
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if(SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a)                                                                                                                   \
+    ({                                                                                                                                               \
+        a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
+    })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+    ({})
+#endif // (SRC_WIDTH % K0)
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                                      M0: 2,3,4,5,6,7,8
+ *                                      K0: 2,3,4,8,16
+ *                                      V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ *
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
+                                         TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                         ,
+                                         uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+                                        )
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
+                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src_stride_z by DEPTH_GEMM3D
+
+    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    output_ptr += z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+    // Load values from the LHS matrix
+    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+    BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+    BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+    BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+    BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+    BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+    BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+    BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+    BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+    // ---------------------------Store output values ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                   \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                          \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                 \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
+    ({                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                   \
+        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);           \
+        DATA_TYPE res1 = a4.s##i;                                                                     \
+        VSTORE(4)                                                                                     \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));    \
+        *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+    })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                       \
+    ({                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                    \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                      \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                    \
+        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                        \
+        VSTORE(4)                                                                                      \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));     \
+        VSTORE(2)                                                                                      \
+        (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+    })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                       \
+    ({                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                    \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                      \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                    \
+        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                               \
+        VSTORE(4)                                                                                      \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));     \
+        VSTORE(3)                                                                                      \
+        (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+    })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                      \
+    ({                                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                  \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
+        VSTORE(M0)                                                                                                    \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));                     \
+    })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                                      M0: 2,3,4,5,6,7,8
+ *                                      K0: 2,3,4,8,16
+ *                                      V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ *
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
+                                        TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                        ,
+                                        uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+                                       )
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
+                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src_stride_z by DEPTH_GEMM3D
+
+    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    output_ptr += z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+
+    // Load values from the LHS matrix
+    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+    BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+    BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+    BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+    BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+    BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+    BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+    BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+    BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+    // ---------------------------Transpose and store block -----------------------
+
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                                      N0: 2,3,4,8,16
+ *                                      K0: 1,2,3,4,8,16
+ *                                      H0: greater than 0
+ *
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
+                                         TENSOR3D_DECLARATION(dst))
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
+                                     x / (uint)H0)
+                                 * (uint)dst_stride_y)
+                                 + z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+
+    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;
+
+    // Load values from the RHS matrix
+    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+    if(y * (uint)K0 + 1 < SRC_HEIGHT)
+    {
+        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    }
+#endif // K0 > 1
+#if K0 > 2
+    if(y * (uint)K0 + 2 < SRC_HEIGHT)
+    {
+        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+    }
+#endif // K0 > 2
+#if K0 > 3
+    if(y * (uint)K0 + 3 < SRC_HEIGHT)
+    {
+        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+    }
+#endif // K0 > 3
+#if K0 > 4
+    if(y * (uint)K0 + 4 < SRC_HEIGHT)
+    {
+        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+    }
+    if(y * (uint)K0 + 5 < SRC_HEIGHT)
+    {
+        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+    }
+    if(y * (uint)K0 + 6 < SRC_HEIGHT)
+    {
+        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+    }
+    if(y * (uint)K0 + 7 < SRC_HEIGHT)
+    {
+        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+    }
+#endif // K0 > 4
+#if K0 > 8
+    if(y * (uint)K0 + 8 < SRC_HEIGHT)
+    {
+        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+    }
+    if(y * (uint)K0 + 9 < SRC_HEIGHT)
+    {
+        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+    }
+    if(y * (uint)K0 + 10 < SRC_HEIGHT)
+    {
+        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+    }
+    if(y * (uint)K0 + 11 < SRC_HEIGHT)
+    {
+        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+    }
+    if(y * (uint)K0 + 12 < SRC_HEIGHT)
+    {
+        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+    }
+    if(y * (uint)K0 + 13 < SRC_HEIGHT)
+    {
+        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+    }
+    if(y * (uint)K0 + 14 < SRC_HEIGHT)
+    {
+        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+    }
+    if(y * (uint)K0 + 15 < SRC_HEIGHT)
+    {
+        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+    }
+#endif // K0 > 8
+
+    // ---------------------------Store output values ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ * @note The option -DTRANSPOSE must passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                                      N0: 2,3,4,8,16
+ *                                      K0: 2,3,4,8,16
+ *                                      H0: greater than 0
+ *
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
+                                        TENSOR3D_DECLARATION(dst))
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
+                                 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    a0=0, a1=0, ... a(K0-1)=0;
+
+    // Load values from the RHS matrix
+    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    if(y * (uint)K0 + 1 < SRC_HEIGHT)
+    {
+        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    }
+#if K0 > 2
+    if(y * (uint)K0 + 2 < SRC_HEIGHT)
+    {
+        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+    }
+#endif // K0 > 2
+#if K0 > 3
+    if(y * (uint)K0 + 3 < SRC_HEIGHT)
+    {
+        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+    }
+#endif // K0 > 3
+#if K0 > 4
+    if(y * (uint)K0 + 4 < SRC_HEIGHT)
+    {
+        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+    }
+    if(y * (uint)K0 + 5 < SRC_HEIGHT)
+    {
+        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+    }
+    if(y * (uint)K0 + 6 < SRC_HEIGHT)
+    {
+        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+    }
+    if(y * (uint)K0 + 7 < SRC_HEIGHT)
+    {
+        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+    }
+#endif // K0 > 4
+#if K0 > 8
+    if(y * (uint)K0 + 8 < SRC_HEIGHT)
+    {
+        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+    }
+    if(y * (uint)K0 + 9 < SRC_HEIGHT)
+    {
+        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+    }
+    if(y * (uint)K0 + 10 < SRC_HEIGHT)
+    {
+        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+    }
+    if(y * (uint)K0 + 11 < SRC_HEIGHT)
+    {
+        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+    }
+    if(y * (uint)K0 + 12 < SRC_HEIGHT)
+    {
+        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+    }
+    if(y * (uint)K0 + 13 < SRC_HEIGHT)
+    {
+        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+    }
+    if(y * (uint)K0 + 14 < SRC_HEIGHT)
+    {
+        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+    }
+    if(y * (uint)K0 + 15 < SRC_HEIGHT)
+    {
+        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+    }
+#endif // K0 > 8
+
+    // ---------------------------Transpose the block ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0)    res0=0, res1=0, res2=0,... res(N0-1)=0;
+
+#if K0 == 2
+    // This part computes the following transpositions:
+    // 2x2 -> 2x2
+    // 2x4 -> 4x2
+    // 2x8 -> 8x2
+    // 2x16 -> 16x2
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 2
+    // This part computes the following transpositions:
+    // 3x2 -> 2x3
+    // 3x4 -> 4x3
+    // 3x8 -> 8x3
+    // 3x16 -> 16x3
+    res0                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+    res1                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+    res2                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+    res5                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+    res6                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+    res7                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+    res9                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+    resA                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+    resB                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+    resC                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+    resD                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+    resE                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+    resF                      = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+    // This part computes the following transpositions:
+    // 4x2 -> 2x4
+    // 4x4 -> 4x4
+    // 4x8 -> 8x4
+    // 4x16 -> 16x4
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
+#endif // N0 > 8
+
+#elif K0 == 8 // K0 == 8
+    // This part computes the following transpositions:
+    // 8x2 -> 2x8
+    // 8x4 -> 4x8
+    // 8x8 -> 8x8
+    // 8x16 -> 16x8
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
+#endif // N0 > 8
+
+#elif K0 == 16 // K0 == 16
+
+    // This part computes the following transpositions:
+    // 16x2 -> 2x16
+    // 16x4 -> 4x16
+    // 16x8 -> 8x16
+    // 16x16 -> 16x16
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
+                                          a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
+                                          a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
+                                          a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
+                                          a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
+                                          a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
+                                          a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
+                                          a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
+                                          a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+                                          a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+                                          a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+                                          a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+                                          a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+                                          a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+                                          a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+                                          a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+                                          a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // N0 == 16
+#error "Not supported N0 value"
+#endif // N0 > 2
+
+    // ---------------------------Store the output values ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(TRANSPOSE)
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+#define ARM_DOT2(a, b, c)       \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#define ARM_DOT3(a, b, c)           \
+    ({                              \
+        ARM_DOT2(a, b, c);          \
+        c = fma((a.s2), (b.s2), c); \
+    })
+#define ARM_DOT4(a, b, c)           \
+    ({                              \
+        ARM_DOT3(a, b, c);          \
+        c = fma((a.s3), (b.s3), c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##8), (c.s8));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##9), (c.s9));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##A), (c.sA));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##B), (c.sB));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##C), (c.sC));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##D), (c.sD));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##E), (c.sE));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##F), (c.sF));     \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+                                          IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                          IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                          IMAGE_DECLARATION(dst),
+                                          uint lhs_stride_z,
+                                          uint rhs_stride_z,
+#if defined(BETA)
+                                          uint bias_stride_z,
+#endif //defined(BETA)
+                                          uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                          ,
+                                          uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                          ,
+                                          uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                         )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate
+        ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#define VFMA(a, b, c)     \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+
+#if M0 == 1
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+    })
+#elif M0 == 2 // M0 == 2
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+    })
+#elif M0 == 3 // M0 == 3
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+    })
+#elif M0 == 4 // M0 == 4
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+    })
+#elif M0 == 5 // M0 == 5
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+    })
+#elif M0 == 6 // M0 == 6
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+    })
+#elif M0 == 7 // M0 == 7
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6));                                            \
+    })
+#elif M0 == 8 // M0 == 8
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7));                                            \
+    })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
+                                           IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                           IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                           IMAGE_DECLARATION(dst),
+                                           uint lhs_stride_z,
+                                           uint rhs_stride_z,
+#if defined(BETA)
+                                           uint bias_stride_z,
+#endif //defined(BETA)
+                                           uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                           ,
+                                           uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                           ,
+                                           uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                          )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(N0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+        LD_RHS_VFMA_M0xN0(0, a, c);
+        LD_RHS_VFMA_M0xN0(1, a, c);
+#if K0 > 2
+        LD_RHS_VFMA_M0xN0(2, a, c);
+#endif // K0 > 2
+#if K0 > 3
+        LD_RHS_VFMA_M0xN0(3, a, c);
+#endif // K0 > 3
+#if K0 > 4
+        LD_RHS_VFMA_M0xN0(4, a, c);
+        LD_RHS_VFMA_M0xN0(5, a, c);
+        LD_RHS_VFMA_M0xN0(6, a, c);
+        LD_RHS_VFMA_M0xN0(7, a, c);
+#endif // K0 > 4
+#if K0 > 8
+        LD_RHS_VFMA_M0xN0(8, a, c);
+        LD_RHS_VFMA_M0xN0(9, a, c);
+        LD_RHS_VFMA_M0xN0(A, a, c);
+        LD_RHS_VFMA_M0xN0(B, a, c);
+        LD_RHS_VFMA_M0xN0(C, a, c);
+        LD_RHS_VFMA_M0xN0(D, a, c);
+        LD_RHS_VFMA_M0xN0(E, a, c);
+        LD_RHS_VFMA_M0xN0(F, a, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        LD_RHS_VFMA_M0xN0(0, a, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+
+#if defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+        c += a.s4 * b.s4;   \
+        c += a.s5 * b.s5;   \
+        c += a.s6 * b.s6;   \
+        c += a.s7 * b.s7;   \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+        c += a.s4 * b.s4;   \
+        c += a.s5 * b.s5;   \
+        c += a.s6 * b.s6;   \
+        c += a.s7 * b.s7;   \
+        c += a.s8 * b.s8;   \
+        c += a.s9 * b.s9;   \
+        c += a.sA * b.sA;   \
+        c += a.sB * b.sB;   \
+        c += a.sC * b.sC;   \
+        c += a.sD * b.sD;   \
+        c += a.sE * b.sE;   \
+        c += a.sF * b.sF;   \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#else  // defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+        c = fma(a.s8, b.s8, c); \
+        c = fma(a.s9, b.s9, c); \
+        c = fma(a.sA, b.sA, c); \
+        c = fma(a.sB, b.sB, c); \
+        c = fma(a.sC, b.sC, c); \
+        c = fma(a.sD, b.sD, c); \
+        c = fma(a.sE, b.sE, c); \
+        c = fma(a.sF, b.sF, c); \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+        ARM_DOT_K0((a), (b##8), (c.s8)); \
+        ARM_DOT_K0((a), (b##9), (c.s9)); \
+        ARM_DOT_K0((a), (b##A), (c.sA)); \
+        ARM_DOT_K0((a), (b##B), (c.sB)); \
+        ARM_DOT_K0((a), (b##C), (c.sC)); \
+        ARM_DOT_K0((a), (b##D), (c.sD)); \
+        ARM_DOT_K0((a), (b##E), (c.sE)); \
+        ARM_DOT_K0((a), (b##F), (c.sF)); \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                            IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                            IMAGE_DECLARATION(dst),
+                                            uint k,
+                                            uint lhs_stride_z,
+                                            uint rhs_stride_z,
+#if defined(BETA)
+                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                            uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                            ,
+                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                           )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
+
+#if defined(MIXED_PRECISION)
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
+    ({                                                 \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
+    })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+    })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+    })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+    })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+    })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
+// a is the column-vector (transposed)
+// b is the row-vector (not transposed)
+// C is the output matrix
+// Lower case is a vector (a, b)
+// Upper case is a matrix (C)
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)           \
+    ({                                                        \
+        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);           \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+    })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions for this matrix multiplications are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the vector-by-matrix macro K0 times
+// A, B and C are matrices
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
+    (M0, N0, TYPE, A, B, C)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
+ *
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
+                                            IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                            IMAGE_DECLARATION(dst),
+                                            uint k,
+                                            uint lhs_stride_z,
+                                            uint rhs_stride_z,
+#if defined(BETA)
+                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                            uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                            ,
+                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                           )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
+    for(int i = 0; i < k; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0 = VLOAD(M0)(0, lhs);
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+        rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#endif // defined(LHS_TRANSPOSE)
+
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#define VFMA(a, b, c)     \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+    })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+    })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+    })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+    })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+    })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+    })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+    })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+    })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS matrix is NOT reshaped
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
+                             IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                             IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                             IMAGE_DECLARATION(dst),
+                             uint lhs_stride_z,
+                             uint rhs_stride_z,
+#if defined(BETA)
+                             uint bias_stride_z,
+#endif //defined(BETA)
+                             uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                             ,
+                             uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                             ,
+                             uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                            )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
+
+        RHS_VFMA_M0xN0(0, a, b0, c);
+        RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+        RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+        RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+        RHS_VFMA_M0xN0(4, a, b4, c);
+        RHS_VFMA_M0xN0(5, a, b5, c);
+        RHS_VFMA_M0xN0(6, a, b6, c);
+        RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+        RHS_VFMA_M0xN0(8, a, b8, c);
+        RHS_VFMA_M0xN0(9, a, b9, c);
+        RHS_VFMA_M0xN0(A, a, bA, c);
+        RHS_VFMA_M0xN0(B, a, bB, c);
+        RHS_VFMA_M0xN0(C, a, bC, c);
+        RHS_VFMA_M0xN0(D, a, bD, c);
+        RHS_VFMA_M0xN0(E, a, bE, c);
+        RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += K0 * rhs_stride_y;
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+        RHS_VFMA_M0xN0(0, a, b, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += rhs_stride_y;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float4 c0 = 0.0f;
+    float4 c1 = 0.0f;
+    float4 c2 = 0.0f;
+    float4 c3 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c0 += (float4)a0.s0 * b0;
+        c1 += (float4)a0.s1 * b0;
+        c2 += (float4)a0.s2 * b0;
+        c3 += (float4)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
+
+        c0 += (float4)a0.s0 * b0;
+        c1 += (float4)a0.s1 * b0;
+        c2 += (float4)a0.s2 * b0;
+        c3 += (float4)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c0 += (float4)a0.s0 * b0;
+        c1 += (float4)a0.s1 * b0;
+        c2 += (float4)a0.s2 * b0;
+        c3 += (float4)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x4 block
+    vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel is optimized for Bifrost and tt computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+                                                         IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                         IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+#if defined(BETA)
+                                                         uint src2_stride_z,
+#endif //defined(BETA)
+                                                         uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                         ,
+                                                         uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                        )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float4 c0 = 0.0f;
+    float4 c1 = 0.0f;
+    float4 c2 = 0.0f;
+    float4 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
+
+    int i = 0;
+    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+        c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+        c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+        c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+        c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+        c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+        c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+        c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+        c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+        c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+        c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+        c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+        c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+    }
+
+    for(; i < (int)(COLS_MTX_B); ++i)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+        c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+        c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+        c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+        c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+        c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+        c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+        c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+        c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+        c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+        c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+        c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+        c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+        c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x4 block
+    vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    half8 c0 = 0.0f;
+    half8 c1 = 0.0f;
+    half8 c2 = 0.0f;
+    half8 c3 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        c0 += (half8)a0.s0 * b0;
+        c1 += (half8)a0.s1 * b0;
+        c2 += (half8)a0.s2 * b0;
+        c3 += (half8)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
+
+        c0 += (half8)a0.s0 * b0;
+        c1 += (half8)a0.s1 * b0;
+        c2 += (half8)a0.s2 * b0;
+        c3 += (half8)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        c0 += (half8)a0.s0 * b0;
+        c1 += (half8)a0.s1 * b0;
+        c2 += (half8)a0.s2 * b0;
+        c3 += (half8)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x8 block
+    vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32 floating point variable.
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                       IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+#if defined(BETA)
+                                                       uint src2_stride_z,
+#endif //defined(BETA)
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float8 c0 = 0.0f;
+    float8 c1 = 0.0f;
+    float8 c2 = 0.0f;
+    float8 c3 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = convert_float4(vload4(0, src_addr_a));
+        float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+        c0 += (float8)a0.s0 * b0;
+        c1 += (float8)a0.s1 * b0;
+        c2 += (float8)a0.s2 * b0;
+        c3 += (float8)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+        b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+        c0 += (float8)a0.s0 * b0;
+        c1 += (float8)a0.s1 * b0;
+        c2 += (float8)a0.s2 * b0;
+        c3 += (float8)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = convert_float4(vload4(0, src_addr_a));
+        float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+        c0 += (float8)a0.s0 * b0;
+        c1 += (float8)a0.s1 * b0;
+        c2 += (float8)a0.s2 * b0;
+        c3 += (float8)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+    float8 bias_f1 = convert_float8(bias1);
+    float8 bias_f2 = convert_float8(bias2);
+    float8 bias_f3 = convert_float8(bias3);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias_f, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    half8 c_h0 = convert_half8(c0);
+    half8 c_h1 = convert_half8(c1);
+    half8 c_h2 = convert_half8(c2);
+    half8 c_h3 = convert_half8(c3);
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x8 block
+    vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
+                                                         IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                         IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+#if defined(BETA)
+                                                         uint src2_stride_z,
+#endif //defined(BETA)
+                                                         uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                         ,
+                                                         uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                        )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    half8 c0 = 0.0f;
+    half8 c1 = 0.0f;
+    half8 c2 = 0.0f;
+    half8 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
+
+    int i = 0;
+    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
+    {
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half8 a0 = vload8(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+
+        // Load values from matrix B (transposed)
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s4, b0, c0);
+        c1 = fma((half8)a0.s5, b0, c1);
+        c2 = fma((half8)a0.s6, b0, c2);
+        c3 = fma((half8)a0.s7, b0, c3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload8(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+
+        // Load values from matrix B (transposed)
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s4, b0, c0);
+        c1 = fma((half8)a0.s5, b0, c1);
+        c2 = fma((half8)a0.s6, b0, c2);
+        c3 = fma((half8)a0.s7, b0, c3);
+#else  // MULT_INTERLEAVE4X4_HEIGHT == 1
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+    }
+
+    for(; i < (int)(COLS_MTX_B); ++i)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c0 = fma((half8)a0.s0, b0, c0);
+        c1 = fma((half8)a0.s1, b0, c1);
+        c2 = fma((half8)a0.s2, b0, c2);
+        c3 = fma((half8)a0.s3, b0, c3);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x8 block
+    vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+
+#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#if defined(DATA_TYPE)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
+ *
+ * @note This OpenCL kernel works with floating point data types (F16/F32)
+ * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
+                                     IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                     IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                     IMAGE_DECLARATION(dst),
+                                     uint src0_stride_z,
+                                     uint src1_stride_z,
+#if defined(BETA)
+                                     uint src2_stride_z,
+#endif //defined(BETA)
+                                     uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                     ,
+                                     uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                     ,
+                                     uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                    )
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(DATA_TYPE);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
+
+    VECTOR_TYPE acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VECTOR_TYPE acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VECTOR_TYPE acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VECTOR_TYPE acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+        VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+        // Accumulate
+        acc0 += b0 * (VECTOR_TYPE)a0.s0;
+        acc0 += b1 * (VECTOR_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * (VECTOR_TYPE)a1.s0;
+        acc1 += b1 * (VECTOR_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * (VECTOR_TYPE)a2.s0;
+        acc2 += b1 * (VECTOR_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * (VECTOR_TYPE)a3.s0;
+        acc3 += b1 * (VECTOR_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+        // Accumulate
+        acc0 += b0 * (VECTOR_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * (VECTOR_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * (VECTOR_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * (VECTOR_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *
+                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(DATA_TYPE)
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                 ,
+                                                 uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for matrix B
+    src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize accumulators
+    float4 acc0 = 0.0f;
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float4 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float4 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float4 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    // A and B src indices get incremented at the same time.
+    int i = 0;
+    for(; i <= ((int)COLS_A - 4); i += 4)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A and matrix B
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A and matrix B
+        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+        acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+        acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
+        acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
+        acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
+        acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
+        acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
+        acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
+        acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
+        acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
+        acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
+        acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
+        acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix A and matrix B
+        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
+        acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
+        acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
+        acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
+        acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
+        acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
+        acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
+        acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
+        acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
+        acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
+        acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
+        acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
+        acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix A and matrix B
+        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
+        acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
+        acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
+        acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
+        acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
+        acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
+        acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
+        acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
+        acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
+        acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
+        acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
+        acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
+        acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix A and matrix B
+        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
+        acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
+        acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
+        acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
+        acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
+        acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
+        acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
+        acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
+        acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
+        acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
+        acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
+        acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
+        acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 4 * sizeof(float);
+    }
+
+    for(; i < (int)COLS_A; ++i)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0, b0.s0, acc0.s0);
+        acc0.s1 = fma(a0, b0.s1, acc0.s1);
+        acc0.s2 = fma(a0, b0.s2, acc0.s2);
+        acc0.s3 = fma(a0, b0.s3, acc0.s3);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1.s0 = fma(a1, b0.s0, acc1.s0);
+        acc1.s1 = fma(a1, b0.s1, acc1.s1);
+        acc1.s2 = fma(a1, b0.s2, acc1.s2);
+        acc1.s3 = fma(a1, b0.s3, acc1.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2.s0 = fma(a2, b0.s0, acc2.s0);
+        acc2.s1 = fma(a2, b0.s1, acc2.s1);
+        acc2.s2 = fma(a2, b0.s2, acc2.s2);
+        acc2.s3 = fma(a2, b0.s3, acc2.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3.s0 = fma(a3, b0.s0, acc3.s0);
+        acc3.s1 = fma(a3, b0.s1, acc3.s1);
+        acc3.s2 = fma(a3, b0.s2, acc3.s2);
+        acc3.s3 = fma(a3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += sizeof(float);
+    }
+
+    int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *
+                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias
+    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
+ * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
+                                                      IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                      IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                      IMAGE_DECLARATION(dst),
+                                                      uint src0_stride_z,
+                                                      uint src1_stride_z,
+#if defined(BETA)
+                                                      uint src2_stride_z,
+#endif //defined(BETA)
+                                                      uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                      ,
+                                                      uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                      ,
+                                                      uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                     )
+{
+    // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize accumulators
+    float2 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float2 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float2 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float2 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    // A and B src indices get incremented at the same time.
+    int i = 0;
+    for(; i <= ((int)COLS_A - 8); i += 8)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+        acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
+        acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
+        acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
+        acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
+        acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
+        acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
+        acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
+
+        acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+        acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
+        acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
+        acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
+        acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
+        acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
+        acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
+        acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(REINTERPRET_INPUT_AS_3D)
+        a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        a0                    = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+        acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
+        acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
+        acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
+        acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
+        acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
+        acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
+        acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
+        acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
+
+        acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
+        acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
+        acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
+        acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
+        acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
+        acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
+        acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
+        acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(REINTERPRET_INPUT_AS_3D)
+        a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        a0                    = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+        acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
+        acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
+        acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
+        acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
+        acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
+        acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
+        acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
+        acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
+
+        acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
+        acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
+        acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
+        acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
+        acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
+        acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
+        acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
+        acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(REINTERPRET_INPUT_AS_3D)
+        a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        a0                    = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+        acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
+        acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
+        acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
+        acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
+        acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
+        acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
+        acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
+        acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
+
+        acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
+        acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
+        acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
+        acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
+        acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
+        acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
+        acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
+        acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += sizeof(float) * 8;
+    }
+    // float size increment
+    for(; i < (int)COLS_A; ++i)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc0.s0 = fma(a0, b0.s0, acc0.s0);
+        acc0.s1 = fma(a0, b0.s1, acc0.s1);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1.s0 = fma(a1, b0.s0, acc1.s0);
+        acc1.s1 = fma(a1, b0.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2.s0 = fma(a2, b0.s0, acc2.s0);
+        acc2.s1 = fma(a2, b0.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3.s0 = fma(a3, b0.s0, acc3.s0);
+        acc3.s1 = fma(a3, b0.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += sizeof(float);
+    }
+
+    int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
+
+    LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *
+                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias
+    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulating the result in a 32 floating point variable.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                       IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+#if defined(BETA)
+                                                       uint src2_stride_z,
+#endif //defined(BETA)
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                       ,
+                                                       uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    float8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    int i = 0;
+    for(; i <= ((int)COLS_A - 4); i += 4)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+
+        // Accumulate
+        acc0 = fma(b0, (float8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 4 * sizeof(half);
+    }
+
+    for(; i < (int)COLS_A; ++i)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
+        src_addr += (int2)(sizeof(half), src1_stride_y);
+
+        // Accumulate
+        acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
+#endif                                    // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
+#endif                                    // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
+#endif                                    // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
+                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float8 bias_f1 = convert_float8(bias1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float8 bias_f2 = convert_float8(bias2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float8 bias_f3 = convert_float8(bias3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias
+    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    half8 acc_h0 = convert_half8(acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    half8 acc_h1 = convert_half8(acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    half8 acc_h2 = convert_half8(acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    half8 acc_h3 = convert_half8(acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                 ,
+                                                 uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    half8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    half8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    half8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    half8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    int i = 0;
+    for(; i <= ((int)COLS_A - 4); i += 4)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Accumulate
+        acc0 = fma(b0, (half8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (half8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (half8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (half8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 4 * sizeof(half);
+    }
+
+    for(; i < (int)COLS_A; ++i)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+        src_addr += (int2)(sizeof(half), src1_stride_y);
+
+        // Accumulate
+        acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
+#endif                                   // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
+#endif                                   // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
+#endif                                   // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
+                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
+#endif // UNIT_BIAS
+
+    // acc = acc + bias
+    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+
+#if defined(BETA)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta's value need to be passed at compile time using -DBETA
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
+                          TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Load values from A x B
+    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+    // Load values from Matrix C
+    float4 c = vload4(0, (__global float *)src.ptr);
+
+    // Computes alpha * axb + beta * c
+    float4 out = alpha_ab + (float4)BETA * c;
+
+    // Store final result in axb matrix
+    vstore4(out, 0, (__global float *)dst.ptr);
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta's value need to be passed at compile time using -DBETA
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
+                          TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Load values from A x B
+    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+    // Load values from Matrix C
+    half8 c = vload8(0, (__global half *)src.ptr);
+
+    // Computes alpha * axb + beta * c
+    half8 out = alpha_ab + (half8)BETA * c;
+
+    // Store final result in axb matrix
+    vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
+
+#if defined(WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
+ *
+ * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @note The input A and matrix B must not be reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
+                             TENSOR3D_DECLARATION(src1),
+                             IMAGE_DECLARATION(dst))
+{
+    int idx = get_global_id(0) * 4;
+    int idy = get_global_id(1);
+
+    // Compute the address for the vector A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
+    src_addr.s1 += idx * sizeof(float);
+
+    int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+    float4 acc = 0.0f;
+
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+    {
+        float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+        acc += b0 * (float4)a0.s0;
+        acc += b1 * (float4)a0.s1;
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+    {
+        float  a0 = *((__global float *)(src0_ptr + src_addr.s0));
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+        acc += b0 * (float4)a0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif // defined(WIDTH_VECTOR_A)
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]      accum_stride_x                       Stride of the accmulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
+__kernel void gemm_accumulate_biases(
+    IMAGE_DECLARATION(accum),
+    VECTOR_DECLARATION(biases))
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    // Vector size, e.g. number of vector elements.
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+    accum_value  = biases_value + accum_value;
+    // Store result in the accumulate buffer
+    VSTORE(VECTOR_SIZE)
+    (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/gemm_helpers.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/gemm_helpers.hembed
new file mode 100644
index 0000000..9b74385
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/gemm_helpers.hembed

@@ -0,0 +1,2164 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ROW_n
+ *
+ * @param[in] N0        The number of rows to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME  The basename of the destination variables for the loaded rows
+ * @param[in] PTR       The base pointer
+ * @param[in] OFFSET    The offset within a row
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+/** @}*/ // end of group LOAD_ROW_n
+
+/** Load Blocks (consecutive rows and columns) with Z offset.
+ * @name LOAD_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of consecutive rows
+ * @param[in] N0        The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME  The basename of the result variables
+ * @param[in] PTR       The base pointer for the data
+ * @param[in] OFFSET    The offset within a row
+ * @param[in] STRIDE_Y  The stride in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_BLOCK
+
+/** Basic macros to calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET_n
+ *
+ * @param[in] M0              The number of offset values to calculate
+ * @param[in] DATA_TYPE       The data type of the results
+ * @param[in] Z               The basename of the result variables
+ * @param[in] Y               The work-itme ID of y-axis
+ * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
+ * @param[in] STRIDE_Y        The stride value in y-axis direction
+ *
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
+    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
+    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
+    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
+    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
+    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
+    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
+    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
+    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+/** @} */ // end of group CALCULATE_Z_OFFSET_n
+
+/** Calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET
+ *
+ * The Z offsets are expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3.
+ * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account
+ * the possible cross plane paddings in case of the plance changes across the z-dimension.
+ *
+ * <!--
+ * |                  |
+ * |      plane0      |
+ * |                  |
+ * |__________________|
+ * |******************|
+ * |  cross_plane_pad |
+ * |******************|
+ * |                  |
+ * |      plane1      |
+ * |                  |
+ * |__________________|
+ * -->
+ *
+ * @param[in] M0              The number of offset values to calculate
+ * @param[in] DATA_TYPE       The data type of the results
+ * @param[in] Z               The basename of the result variables
+ * @param[in] Y               The work-itme ID of y-axis
+ * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
+ * @param[in] STRIDE_Y        The stride value in y-axis direction
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+/** @} */ // end of group CALCULATE_Z_OFFSET
+
+/** Store the 0 to (n-1)th rows of the given variables
+ * @name STORE_ROW_n
+ *
+ * @param[in] N0        The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                 \
+    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
+    VSTORE(N0)                                                  \
+    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of groupd STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0        The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                     \
+    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of groupd CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of rows to store
+ * @param[in] N0        The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of rows to store
+ * @param[in] N0        The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
+
+/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
+ * @name SCALE_ROW_n
+ *
+ * @param[in] DATA_TYPE The data type of the variables
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] SCALE     The scale factor
+ * @{
+ */
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+    BASENAME##0 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##1 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##2 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##3 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##4 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##5 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##6 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##7 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##8 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
+    BASENAME##9 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##A *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##B *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##C *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##D *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##E *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##F *= (DATA_TYPE)SCALE;
+/** @} */ // end of group SCALE_ROW_n
+
+/** Scale elements stored in a block (BASENAME)
+ * @name SCALE_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N         The number of rows in the block
+ * @param[in] DATA_TYPE The data type of the block
+ * @param[in] BASENAME  The basename of the block
+ * @param[in] SCALE     The scale factor
+ * @{
+ */
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+/** @} */ // end of group SCALE_BLOCK
+
+/** Create a new vector containing the values at the given index for a set of given vectors
+ * @name COLUMN_VECTORn
+ *
+ * @param[in] IDX_COL  The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X        The basename of the source vectors
+ * @param[in] TYPE     The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
+    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 2)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
+#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 3)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
+#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 4)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 8)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 16)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+/** @} */ // end of group COLUMN_VECTORn
+
+/** Create transposed vectors of the given vectors
+ * @name TRANSPOSE_K0Xn
+ *
+ * @param[in] K0       The size of the source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B        The basename of source vectors for transposition
+ * @param[in] TYPE     The data type of the transposed vectors
+ * @{
+ */
+#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) \
+    COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X1(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X2(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X3(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X4(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X8(K0, BASENAME, B, TYPE);     \
+    COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, A, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, B, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, C, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, D, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, E, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+
+/** @} */ // end of group TRANSPOSE_K0Xn
+
+/** Create column vectors to contain the values at the given index for a set of given vectors
+ *
+ * @param[in] K0       The number of source vectors
+ * @param[in] IDX_COL  The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B        The basename of the source vectors
+ * @param[in] TYPE     The data type of the destination vectors
+ */
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
+    CONCAT(COLUMN_VECTOR, K0)                         \
+    (IDX_COL, BASENAME, B, TYPE);
+
+/** Create transposed vectors form the given source vectors
+ *
+ * @param[in] K0       The size of source vectors
+ * @param[in] N0       The number of source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B        The basename of source vectors for transposition
+ * @param[in] TYPE     The data type of the transposed vectors
+ *
+ */
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
+    CONCAT(TRANSPOSE_K0X, N0)                      \
+    (K0, BASENAME, B, TYPE);
+
+/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
+ * @name ADD_ROW_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The basename of the added variables
+ * @{
+ */
+#define ADD_ROW_1(BASENAME, BIAS) \
+    BASENAME##0 += BIAS##0;
+
+#define ADD_ROW_2(BASENAME, BIAS) \
+    ADD_ROW_1(BASENAME, BIAS)     \
+    BASENAME##1 += BIAS##1;
+
+#define ADD_ROW_3(BASENAME, BIAS) \
+    ADD_ROW_2(BASENAME, BIAS)     \
+    BASENAME##2 += BIAS##2;
+
+#define ADD_ROW_4(BASENAME, BIAS) \
+    ADD_ROW_3(BASENAME, BIAS)     \
+    BASENAME##3 += BIAS##3;
+
+#define ADD_ROW_5(BASENAME, BIAS) \
+    ADD_ROW_4(BASENAME, BIAS)     \
+    BASENAME##4 += BIAS##4;
+
+#define ADD_ROW_6(BASENAME, BIAS) \
+    ADD_ROW_5(BASENAME, BIAS)     \
+    BASENAME##5 += BIAS##5;
+
+#define ADD_ROW_7(BASENAME, BIAS) \
+    ADD_ROW_6(BASENAME, BIAS)     \
+    BASENAME##6 += BIAS##6;
+
+#define ADD_ROW_8(BASENAME, BIAS) \
+    ADD_ROW_7(BASENAME, BIAS)     \
+    BASENAME##7 += BIAS##7;
+
+#define ADD_ROW_9(BASENAME, BIAS) \
+    ADD_ROW_8(BASENAME, BIAS)     \
+    BASENAME##8 += BIAS##8;
+
+#define ADD_ROW_10(BASENAME, BIAS) \
+    ADD_ROW_9(BASENAME, BIAS)      \
+    BASENAME##9 += BIAS##9;
+
+#define ADD_ROW_11(BASENAME, BIAS) \
+    ADD_ROW_10(BASENAME, BIAS)     \
+    BASENAME##A += BIAS##A;
+
+#define ADD_ROW_12(BASENAME, BIAS) \
+    ADD_ROW_11(BASENAME, BIAS)     \
+    BASENAME##B += BIAS##B;
+
+#define ADD_ROW_13(BASENAME, BIAS) \
+    ADD_ROW_12(BASENAME, BIAS)     \
+    BASENAME##C += BIAS##C;
+
+#define ADD_ROW_14(BASENAME, BIAS) \
+    ADD_ROW_13(BASENAME, BIAS)     \
+    BASENAME##D += BIAS##D;
+
+#define ADD_ROW_15(BASENAME, BIAS) \
+    ADD_ROW_14(BASENAME, BIAS)     \
+    BASENAME##E += BIAS##E;
+
+#define ADD_ROW_16(BASENAME, BIAS) \
+    ADD_ROW_15(BASENAME, BIAS)     \
+    BASENAME##F += BIAS##F;
+
+/** @} */ // end of group ADD_ROW_n
+
+/** Add the block (BIAS) to another block (BASENAME)
+ * @name ADD_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N        The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The basename of the added variables
+ * @{
+ */
+#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK
+
+/** Broadcast (add single value) to the each element of the destination variables
+ * @name ADD_ROW_BROADCAST_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The variable containing the value to add
+ * @{
+ */
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
+    BASENAME##0 += BIAS;
+
+#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
+    BASENAME##1 += BIAS;
+
+#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
+    BASENAME##2 += BIAS;
+
+#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
+    BASENAME##3 += BIAS;
+
+#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
+    BASENAME##4 += BIAS;
+
+#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
+    BASENAME##5 += BIAS;
+
+#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
+    BASENAME##6 += BIAS;
+
+#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
+    BASENAME##7 += BIAS;
+
+#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
+    BASENAME##8 += BIAS;
+
+#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
+    BASENAME##9 += BIAS;
+
+#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
+    BASENAME##A += BIAS;
+
+#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
+    BASENAME##B += BIAS;
+
+#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
+    BASENAME##C += BIAS;
+
+#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
+    BASENAME##D += BIAS;
+
+#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
+    BASENAME##E += BIAS;
+
+#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
+    BASENAME##F += BIAS;
+
+/** Broadcast (add a value) to the each element of the destination block (BASENAME)
+ * @name ADD_BLOCK_BROADCAST
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N        The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The variable containing the value to add
+ * @{
+ */
+#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK_BROADCAST
+
+/** Apply activation to the given variables
+ * @name ACTIVATION_ROW_n
+ *
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE       The data type of the vectors
+ * @param[in] BASENAME        The basename of the variables
+ * @param[in] A_VAL           Additional value required by the activation
+ * @param[in] B_VAL           Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)      \
+    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+/** @} */ // end of group ACTIVATION_ROW_n
+
+/** Apply activation to a block (BASENAME)
+ * @name ACTIVATION_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N               The number of vectors in the block
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE       The data type of the vectors
+ * @param[in] BASENAME        The basename of the variables
+ * @param[in] A_VAL           Additional value required by the activation
+ * @param[in] B_VAL           Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+/** @} */ // end of group ACTIVATION_BLOCK
+
+/** Apply convert_<data_type> to the given variables
+ * @name CONVERT_ROW_n
+ *
+ * @param[in] N            The size of the vectors
+ * @param[in] DATA_TYPE    The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ */
+#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
+/** @} */ // end of group CONVERT_ROW_n
+
+/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
+ * @name CONVERT_BLOCK
+ *
+ * Supported cases N=1,2,3,...,16.
+ *
+ * @param[in] M            The number of vectors to convert
+ * @param[in] N            The size of the vectors
+ * @param[in] DATA_TYPE    The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ */
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/gemmlowp.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/gemmlowp.clembed
new file mode 100644
index 0000000..04d385d
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/gemmlowp.clembed

@@ -0,0 +1,5380 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ROW_n
+ *
+ * @param[in] N0        The number of rows to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME  The basename of the destination variables for the loaded rows
+ * @param[in] PTR       The base pointer
+ * @param[in] OFFSET    The offset within a row
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+/** @}*/ // end of group LOAD_ROW_n
+
+/** Load Blocks (consecutive rows and columns) with Z offset.
+ * @name LOAD_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of consecutive rows
+ * @param[in] N0        The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME  The basename of the result variables
+ * @param[in] PTR       The base pointer for the data
+ * @param[in] OFFSET    The offset within a row
+ * @param[in] STRIDE_Y  The stride in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_BLOCK
+
+/** Basic macros to calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET_n
+ *
+ * @param[in] M0              The number of offset values to calculate
+ * @param[in] DATA_TYPE       The data type of the results
+ * @param[in] Z               The basename of the result variables
+ * @param[in] Y               The work-itme ID of y-axis
+ * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
+ * @param[in] STRIDE_Y        The stride value in y-axis direction
+ *
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
+    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
+    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
+    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
+    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
+    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
+    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
+    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                               \
+    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
+    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+/** @} */ // end of group CALCULATE_Z_OFFSET_n
+
+/** Calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET
+ *
+ * The Z offsets are expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3.
+ * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account
+ * the possible cross plane paddings in case of the plance changes across the z-dimension.
+ *
+ * <!--
+ * |                  |
+ * |      plane0      |
+ * |                  |
+ * |__________________|
+ * |******************|
+ * |  cross_plane_pad |
+ * |******************|
+ * |                  |
+ * |      plane1      |
+ * |                  |
+ * |__________________|
+ * -->
+ *
+ * @param[in] M0              The number of offset values to calculate
+ * @param[in] DATA_TYPE       The data type of the results
+ * @param[in] Z               The basename of the result variables
+ * @param[in] Y               The work-itme ID of y-axis
+ * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
+ * @param[in] STRIDE_Y        The stride value in y-axis direction
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+/** @} */ // end of group CALCULATE_Z_OFFSET
+
+/** Store the 0 to (n-1)th rows of the given variables
+ * @name STORE_ROW_n
+ *
+ * @param[in] N0        The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                 \
+    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
+    VSTORE(N0)                                                  \
+    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of groupd STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0        The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                         \
+    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                     \
+    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                          \
+    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of groupd CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of rows to store
+ * @param[in] N0        The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0        The number of rows to store
+ * @param[in] N0        The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
+
+/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
+ * @name SCALE_ROW_n
+ *
+ * @param[in] DATA_TYPE The data type of the variables
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] SCALE     The scale factor
+ * @{
+ */
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+    BASENAME##0 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##1 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##2 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##3 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##4 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##5 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##6 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##7 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##8 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
+    BASENAME##9 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##A *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##B *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##C *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##D *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##E *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##F *= (DATA_TYPE)SCALE;
+/** @} */ // end of group SCALE_ROW_n
+
+/** Scale elements stored in a block (BASENAME)
+ * @name SCALE_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N         The number of rows in the block
+ * @param[in] DATA_TYPE The data type of the block
+ * @param[in] BASENAME  The basename of the block
+ * @param[in] SCALE     The scale factor
+ * @{
+ */
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+/** @} */ // end of group SCALE_BLOCK
+
+/** Create a new vector containing the values at the given index for a set of given vectors
+ * @name COLUMN_VECTORn
+ *
+ * @param[in] IDX_COL  The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X        The basename of the source vectors
+ * @param[in] TYPE     The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
+    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 2)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
+#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 3)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
+#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 4)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 8)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+    VEC_DATA_TYPE(TYPE, 16)                         \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+/** @} */ // end of group COLUMN_VECTORn
+
+/** Create transposed vectors of the given vectors
+ * @name TRANSPOSE_K0Xn
+ *
+ * @param[in] K0       The size of the source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B        The basename of source vectors for transposition
+ * @param[in] TYPE     The data type of the transposed vectors
+ * @{
+ */
+#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) \
+    COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X1(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X2(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X3(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X4(K0, BASENAME, B, TYPE);    \
+    COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE);  \
+    COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
+    TRANSPOSE_K0X8(K0, BASENAME, B, TYPE);     \
+    COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, A, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, B, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, C, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, D, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, E, BASENAME, B, TYPE);   \
+    COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+
+/** @} */ // end of group TRANSPOSE_K0Xn
+
+/** Create column vectors to contain the values at the given index for a set of given vectors
+ *
+ * @param[in] K0       The number of source vectors
+ * @param[in] IDX_COL  The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B        The basename of the source vectors
+ * @param[in] TYPE     The data type of the destination vectors
+ */
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
+    CONCAT(COLUMN_VECTOR, K0)                         \
+    (IDX_COL, BASENAME, B, TYPE);
+
+/** Create transposed vectors form the given source vectors
+ *
+ * @param[in] K0       The size of source vectors
+ * @param[in] N0       The number of source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B        The basename of source vectors for transposition
+ * @param[in] TYPE     The data type of the transposed vectors
+ *
+ */
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
+    CONCAT(TRANSPOSE_K0X, N0)                      \
+    (K0, BASENAME, B, TYPE);
+
+/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
+ * @name ADD_ROW_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The basename of the added variables
+ * @{
+ */
+#define ADD_ROW_1(BASENAME, BIAS) \
+    BASENAME##0 += BIAS##0;
+
+#define ADD_ROW_2(BASENAME, BIAS) \
+    ADD_ROW_1(BASENAME, BIAS)     \
+    BASENAME##1 += BIAS##1;
+
+#define ADD_ROW_3(BASENAME, BIAS) \
+    ADD_ROW_2(BASENAME, BIAS)     \
+    BASENAME##2 += BIAS##2;
+
+#define ADD_ROW_4(BASENAME, BIAS) \
+    ADD_ROW_3(BASENAME, BIAS)     \
+    BASENAME##3 += BIAS##3;
+
+#define ADD_ROW_5(BASENAME, BIAS) \
+    ADD_ROW_4(BASENAME, BIAS)     \
+    BASENAME##4 += BIAS##4;
+
+#define ADD_ROW_6(BASENAME, BIAS) \
+    ADD_ROW_5(BASENAME, BIAS)     \
+    BASENAME##5 += BIAS##5;
+
+#define ADD_ROW_7(BASENAME, BIAS) \
+    ADD_ROW_6(BASENAME, BIAS)     \
+    BASENAME##6 += BIAS##6;
+
+#define ADD_ROW_8(BASENAME, BIAS) \
+    ADD_ROW_7(BASENAME, BIAS)     \
+    BASENAME##7 += BIAS##7;
+
+#define ADD_ROW_9(BASENAME, BIAS) \
+    ADD_ROW_8(BASENAME, BIAS)     \
+    BASENAME##8 += BIAS##8;
+
+#define ADD_ROW_10(BASENAME, BIAS) \
+    ADD_ROW_9(BASENAME, BIAS)      \
+    BASENAME##9 += BIAS##9;
+
+#define ADD_ROW_11(BASENAME, BIAS) \
+    ADD_ROW_10(BASENAME, BIAS)     \
+    BASENAME##A += BIAS##A;
+
+#define ADD_ROW_12(BASENAME, BIAS) \
+    ADD_ROW_11(BASENAME, BIAS)     \
+    BASENAME##B += BIAS##B;
+
+#define ADD_ROW_13(BASENAME, BIAS) \
+    ADD_ROW_12(BASENAME, BIAS)     \
+    BASENAME##C += BIAS##C;
+
+#define ADD_ROW_14(BASENAME, BIAS) \
+    ADD_ROW_13(BASENAME, BIAS)     \
+    BASENAME##D += BIAS##D;
+
+#define ADD_ROW_15(BASENAME, BIAS) \
+    ADD_ROW_14(BASENAME, BIAS)     \
+    BASENAME##E += BIAS##E;
+
+#define ADD_ROW_16(BASENAME, BIAS) \
+    ADD_ROW_15(BASENAME, BIAS)     \
+    BASENAME##F += BIAS##F;
+
+/** @} */ // end of group ADD_ROW_n
+
+/** Add the block (BIAS) to another block (BASENAME)
+ * @name ADD_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N        The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The basename of the added variables
+ * @{
+ */
+#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK
+
+/** Broadcast (add single value) to the each element of the destination variables
+ * @name ADD_ROW_BROADCAST_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The variable containing the value to add
+ * @{
+ */
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
+    BASENAME##0 += BIAS;
+
+#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
+    BASENAME##1 += BIAS;
+
+#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
+    BASENAME##2 += BIAS;
+
+#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
+    BASENAME##3 += BIAS;
+
+#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
+    BASENAME##4 += BIAS;
+
+#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
+    BASENAME##5 += BIAS;
+
+#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
+    BASENAME##6 += BIAS;
+
+#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
+    BASENAME##7 += BIAS;
+
+#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
+    BASENAME##8 += BIAS;
+
+#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
+    BASENAME##9 += BIAS;
+
+#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
+    BASENAME##A += BIAS;
+
+#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
+    BASENAME##B += BIAS;
+
+#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
+    BASENAME##C += BIAS;
+
+#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
+    BASENAME##D += BIAS;
+
+#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
+    BASENAME##E += BIAS;
+
+#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
+    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
+    BASENAME##F += BIAS;
+
+/** Broadcast (add a value) to the each element of the destination block (BASENAME)
+ * @name ADD_BLOCK_BROADCAST
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N        The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS     The variable containing the value to add
+ * @{
+ */
+#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK_BROADCAST
+
+/** Apply activation to the given variables
+ * @name ACTIVATION_ROW_n
+ *
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE       The data type of the vectors
+ * @param[in] BASENAME        The basename of the variables
+ * @param[in] A_VAL           Additional value required by the activation
+ * @param[in] B_VAL           Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)      \
+    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)     \
+    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+/** @} */ // end of group ACTIVATION_ROW_n
+
+/** Apply activation to a block (BASENAME)
+ * @name ACTIVATION_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N               The number of vectors in the block
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE       The data type of the vectors
+ * @param[in] BASENAME        The basename of the variables
+ * @param[in] A_VAL           Additional value required by the activation
+ * @param[in] B_VAL           Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+/** @} */ // end of group ACTIVATION_BLOCK
+
+/** Apply convert_<data_type> to the given variables
+ * @name CONVERT_ROW_n
+ *
+ * @param[in] N            The size of the vectors
+ * @param[in] DATA_TYPE    The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ */
+#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
+    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
+    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
+    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
+/** @} */ // end of group CONVERT_ROW_n
+
+/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
+ * @name CONVERT_BLOCK
+ *
+ * Supported cases N=1,2,3,...,16.
+ *
+ * @param[in] M            The number of vectors to convert
+ * @param[in] N            The size of the vectors
+ * @param[in] DATA_TYPE    The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ */
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+/** Macros that help in loop unrolling */
+//Repeat macros with 3 param, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(1, P_A, P_B, P_C);       \
+    REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(2, P_A, P_B, P_C);       \
+    REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(3, P_A, P_B, P_C);       \
+    REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(4, P_A, P_B, P_C);       \
+    REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(5, P_A, P_B, P_C);       \
+    REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(6, P_A, P_B, P_C);       \
+    REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(7, P_A, P_B, P_C);       \
+    REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(8, P_A, P_B, P_C);       \
+    REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(9, P_A, P_B, P_C);        \
+    REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(A, P_A, P_B, P_C);        \
+    REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(B, P_A, P_B, P_C);        \
+    REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(C, P_A, P_B, P_C);        \
+    REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(D, P_A, P_B, P_C);        \
+    REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(E, P_A, P_B, P_C);        \
+    REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(F, P_A, P_B, P_C);        \
+    REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+//Macro for initializing N variables. generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
+
+#endif // ARM_COMPUTE_REPEAT_H
+
+#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. These macros use the dot8 instruction */
+#define ARM_DOT1(a, b, c)                                                                                                                               \
+    ({                                                                                                                                                  \
+        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
+    })
+#define ARM_DOT2(a, b, c)                                                                                                                               \
+    ({                                                                                                                                                  \
+        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
+    })
+#define ARM_DOT3(a, b, c)                                                                                           \
+    ({                                                                                                              \
+        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
+    })
+#define ARM_DOT4(a, b, c) \
+    ({                    \
+        ARM_DOT(a, b, c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
+#define ARM_DOT1(a, b, c)          \
+    ({                             \
+        c += (ACC_DATA_TYPE)a * b; \
+    })
+#define ARM_DOT2(a, b, c)                \
+    ({                                   \
+        c += (ACC_DATA_TYPE)a.s0 * b.s0; \
+        c += (ACC_DATA_TYPE)a.s1 * b.s1; \
+    })
+#define ARM_DOT3(a, b, c)                \
+    ({                                   \
+        ARM_DOT2(a, b, c);               \
+        c += (ACC_DATA_TYPE)a.s2 * b.s2; \
+    })
+#define ARM_DOT4(a, b, c)                \
+    ({                                   \
+        ARM_DOT3(a, b, c);               \
+        c += (ACC_DATA_TYPE)a.s3 * b.s3; \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X2(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+        ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+    })
+#define ARM_DOT_K0X3(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X2(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+    })
+#define ARM_DOT_K0X4(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X3(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+    })
+#define ARM_DOT_K0X8(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X4(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+        ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+        ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+        ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+    })
+#define ARM_DOT_K0X16(k0, a, b, c)           \
+    ({                                       \
+        ARM_DOT_K0X8(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+        ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+        ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+        ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+        ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+        ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+        ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+        ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+    })
+
+/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
+    })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X1(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+    })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X2(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+    })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X3(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+    })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X4(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+    })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X5(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+    })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X6(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+    })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X7(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+    })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+    ({                          \
+        CONCAT(ARM_DOT, k0)     \
+        ((a), (b), (c));        \
+    })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+    ({                                 \
+        CONCAT(ARM_DOT_K0X, n0)        \
+        (k0, (a), b, (c));             \
+    })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+    ({                                       \
+        CONCAT(ARM_MM_K0XN0X, m0)            \
+        (n0, k0, a, b, c);                   \
+    })
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst),
+                                  uint src0_stride_z,
+                                  uint src1_stride_z,
+                                  uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                  ,
+                                  uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                  ,
+                                  uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                 )
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    int end_row_vec_a = src_addr.s0 + COLS_A;
+
+    VECTOR_ACC_TYPE acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VECTOR_ACC_TYPE acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VECTOR_ACC_TYPE acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VECTOR_ACC_TYPE acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    VECTOR_ACC_TYPE acc4 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+    {
+        // Load values from matrix A
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        // Load values from matrix B
+        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+        VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+        // Accumulate
+        acc0 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a0.s0;
+        acc0 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a1.s0;
+        acc1 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a2.s0;
+        acc2 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a3.s0;
+        acc3 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        acc4 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a4.s0;
+        acc4 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a4.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+    {
+        // Load values from matrix A
+        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        DATA_TYPE a4 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        // Load values from matrix B
+        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+        // Accumulate
+        acc0 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        acc4 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a4;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    }
+
+    const int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
+    // Store the result
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst.ptr += z * dst_stride_z;
+
+    // Store the result
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM data type.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                                IMAGE_DECLARATION(rhs),
+                                                IMAGE_DECLARATION(dst),
+                                                uint k,
+                                                uint lhs_stride_z,
+                                                uint rhs_stride_z,
+                                                uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                ,
+                                                uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                               )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+        // Partial matrix multiplication M0,N0,K0
+        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+        // Update address
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Convert and store output block
+    CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+                                              IMAGE_DECLARATION(rhs),
+                                              IMAGE_DECLARATION(dst),
+                                              uint lhs_stride_z,
+                                              uint rhs_stride_z,
+                                              uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                              ,
+                                              uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                              ,
+                                              uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                             )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(N0-1)=0;
+
+    for(int i = 0; i < K; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+        // Partial matrix multiplication M0,N0,K0
+        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+        lhs_offset += K0;
+        rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Convert and store output block
+    CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS matrix is NOT reshaped
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
+                                 IMAGE_DECLARATION(rhs),
+                                 IMAGE_DECLARATION(dst),
+                                 uint lhs_stride_z,
+                                 uint rhs_stride_z,
+                                 uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                 ,
+                                 uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                 ,
+                                 uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                )
+{
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+        // Transpose the values from RHS matrix
+        TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
+
+        // Partial matrix multiplication M0,N0,K0
+        ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+
+        // Update the offset
+        lhs_offset += K0;
+        rhs_offset += K0 * rhs_stride_y;
+    }
+
+    // Left-over for loop
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+        // Transpose the values from RHS matrix
+        TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
+
+        // Partial matrix multiplication M0,N0,1
+        ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+
+        // Update the offset
+        lhs_offset += 1;
+        rhs_offset += rhs_stride_y;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Convert and store output block
+    CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+#if defined(COLS_A)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
+                                          IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    sum_row_32            = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
+    ACC_DATA_TYPE sum_row = 0;
+
+    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+
+    // This for loop performs 16 accumulations
+    for(; i <= ((int)COLS_A - 16); i += 16)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
+
+        sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF,
+                      VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < COLS_A; ++i)
+    {
+        sum_row += (ACC_DATA_TYPE)matrix_a[i];
+    }
+
+    sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+
+    *((__global int *)dst.ptr) = (int)sum_row;
+}
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
+                                               IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    ACC_DATA_TYPE sum_row = 0;
+
+    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+
+    // This for loop performs 16 accumulations
+    for(; i <= ((int)COLS_A - 32); i += 32)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 16)
+        a0 = vload16(0, matrix_a + i);
+
+        sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+
+        a0 = vload16(1, matrix_a + i);
+
+        sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < COLS_A; ++i)
+    {
+        sum_row += (ACC_DATA_TYPE)matrix_a[i];
+    }
+
+    *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(COLS_A)
+
+#if defined(COLS_B) && defined(ROWS_B)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
+                                          IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+    sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0;
+
+    __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+    // This for loop performs 4 accumulations
+    for(; i <= ((int)ROWS_B - 4); i += 4)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, 16)
+        b0 = vload16(0, matrix_b + 0 * src_stride_y);
+        const VEC_DATA_TYPE(DATA_TYPE, 16)
+        b1 = vload16(0, matrix_b + 1 * src_stride_y);
+        const VEC_DATA_TYPE(DATA_TYPE, 16)
+        b2 = vload16(0, matrix_b + 2 * src_stride_y);
+        const VEC_DATA_TYPE(DATA_TYPE, 16)
+        b3 = vload16(0, matrix_b + 3 * src_stride_y);
+
+        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE,
+                      16));
+
+        matrix_b += 4 * src_stride_y;
+    }
+
+    // This for loop perfoms the leftover accumulations
+    for(; i < (int)ROWS_B; ++i)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, 16)
+        b0 = vload16(0, matrix_b);
+
+        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+        matrix_b += src_stride_y;
+    }
+
+    vstore16(convert_int16(sum_col_32), 0, (__global int *)dst.ptr);
+}
+#endif // defined(COLS_B) && defined(ROWS_B)
+
+#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(K_OFFSET)
+
+/* Helper function used to calculate the offset contribution after matrix multiplication.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * @param[in] x                                     get_global_id(0) * 4
+ * @param[in] y                                     get_global_id(1)
+ * @param[in] z                                     get_global_id(2)
+ * @param[in] sum_col_ptr                           (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x                        (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y                        (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr                           (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x                        (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y                        (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases tensor
+ */
+inline int4 offset_contribution(
+    int x,
+    int y,
+    int z
+#if defined(A_OFFSET)
+    ,
+    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+    ,
+    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+    int4 a_offset_s32 = (int4)0;
+    int4 b_offset_s32 = (int4)0;
+
+    int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+    batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+    // Compute the offset contribution due to A_OFFSET
+    __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+    // Compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else  // defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+    a_offset_s32 *= (int4)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+    // Compute the offset contribution due to A_OFFSET
+    __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+    // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else  // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 *= (int4)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    b_offset_s32 += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
+
+/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x                          (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y                          (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x                          (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y                          (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr                              (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x                         (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x                           (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes    (Optional) The offset of the first element in the biases tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                           ,
+                                           IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                           ,
+                                           IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                           ,
+                                           VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS))
+                                          )
+{
+    const int x = get_global_id(0) * 4;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Compute offset contribution
+    int4 offset_term_s32 = offset_contribution(
+                               x, y, z
+#if defined(A_OFFSET)
+                               ,
+                               sum_col_ptr,
+                               sum_col_stride_x,
+                               sum_col_step_x,
+                               sum_col_stride_y,
+                               sum_col_step_y,
+                               sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                               ,
+                               sum_row_ptr,
+                               sum_row_stride_x,
+                               sum_row_step_x,
+                               sum_row_stride_y,
+                               sum_row_step_y,
+                               sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                               ,
+                               biases_ptr,
+                               biases_stride_x,
+                               biases_step_x,
+                               biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                           );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // Store the result with the offset contribution
+    vstore4(in_s32, 0, (__global int *)mm_result_addr);
+}
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  mm_result_ptr                                    Pointer to the source tensor. Supported data type: S32
+ * @param[in]  mm_result_stride_x                               Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                                 mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                               Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                                 mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                               Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                                 mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes          The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                          Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                       dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                       dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                       src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in]  result_multipliers_step_x                        (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in]  result_shifts_step_x                             (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                         ,
+                                                         IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                         ,
+                                                         IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                         ,
+#if defined(ADD_BIAS)
+                                                         VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                         TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+                                                         ,
+                                                         VECTOR_DECLARATION(result_multipliers),
+                                                         VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+                                                        )
+{
+    const int x = get_global_id(0) * 4;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    // Compute offset contribution
+    int4 offset_term_s32 = offset_contribution(
+                               x, y, z
+#if defined(A_OFFSET)
+                               ,
+                               sum_col_ptr,
+                               sum_col_stride_x,
+                               sum_col_step_x,
+                               sum_col_stride_y,
+                               sum_col_step_y,
+                               sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                               ,
+                               sum_row_ptr,
+                               sum_row_stride_x,
+                               sum_row_step_x,
+                               sum_row_stride_y,
+                               sum_row_step_y,
+                               sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                               ,
+                               biases_ptr,
+                               biases_stride_x,
+                               biases_step_x,
+                               biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                           );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (int4)RESULT_OFFSET;
+
+    // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+    int4            result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+    int4            result_shifts_values      = vload4(0, (__global int *)result_shifts_addr);
+
+    in_s32 *= result_multipliers_values;
+    in_s32 >>= result_shifts_values;
+#else  // defined(PER_CHANNEL_QUANTIZATION)
+    in_s32 *= RESULT_MULTIPLIER;
+
+    in_s32 >>= RESULT_SHIFT;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+    res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+    res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+
+/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  mm_result_ptr                                    Pointer to the source tensor. Supported data type: S32
+ * @param[in]  mm_result_stride_x                               Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                                 mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                               Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                                 mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                               Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                                 mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes          The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                          Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                       dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                       dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                       src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in]  result_multipliers_step_x                        (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in]  result_shifts_step_x                             (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                                    ,
+#if defined(ADD_BIAS)
+                                                                    VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                    TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+                                                                    ,
+                                                                    VECTOR_DECLARATION(result_multipliers),
+                                                                    VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+                                                                   )
+{
+    const int x = get_global_id(0) * 4;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Compute offset contribution
+    int4 offset_term_s32 = offset_contribution(
+                               x, y, z
+#if defined(A_OFFSET)
+                               ,
+                               sum_col_ptr,
+                               sum_col_stride_x,
+                               sum_col_step_x,
+                               sum_col_stride_y,
+                               sum_col_step_y,
+                               sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                               ,
+                               sum_row_ptr,
+                               sum_row_stride_x,
+                               sum_row_step_x,
+                               sum_row_stride_y,
+                               sum_row_step_y,
+                               sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                               ,
+                               biases_ptr,
+                               biases_stride_x,
+                               biases_step_x,
+                               biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                           );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+    int4            result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+    int4            result_shifts_values      = vload4(0, (__global int *)result_shifts_addr);
+
+    int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+    int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+    in_s32                = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (int4)RESULT_OFFSET;
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+    res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+    res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+#endif // defined(K_OFFSET)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                  VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                  TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Add the offset terms to GEMM's result
+    input_values += (int4)RESULT_OFFSET;
+
+    // Multiply by result_mult_int and shift
+    input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+    input_values >>= -RESULT_SHIFT;
+#else  // RESULT_SHIFT >= 0
+    input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+    uchar4 res = convert_uchar4_sat(input_values);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                             VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                             TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+    // Add the offset terms to GEMM's result
+    input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+    res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+    res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                                     VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                     TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+    short4 res = convert_short4_sat(input_values);
+
+#if defined(MIN_BOUND)
+    res = max(res, (short4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (short4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, (__global short *)dst_addr);
+}
+#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Requantize
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                         Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                           src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                        VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+                                                        TENSOR4D_DECLARATION(dst))
+#else  // defined(DST_HEIGHT)
+                                                        TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+    // Compute source and destination addresses
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Convert to float
+    float16 input_values_f = convert_float4(input_values);
+    input_values_f         = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+    uchar4 res = convert_uchar4_sat(input_values_f);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, dst_addr);
+}
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/gemv.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/gemv.clembed
new file mode 100644
index 0000000..d51bcc5
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/gemv.clembed

@@ -0,0 +1,743 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+/** This kernel applies dot product to each plane on the input tensor and the corrispective column of the reshaped weight tensor.
+ *
+ * @note Datatype and source width and height should be given as a preprocessor argument using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height. e.g. -DDATA_TYPE=short
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ */
+__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    DATA_TYPE acc0 = (DATA_TYPE)0;
+    DATA_TYPE acc1 = (DATA_TYPE)0;
+    DATA_TYPE acc2 = (DATA_TYPE)0;
+    DATA_TYPE acc3 = (DATA_TYPE)0;
+
+    // This kernel handle 4 rows in per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
+
+        acc0 += dot(weights, tmp0);
+        acc1 += dot(weights, tmp1);
+        acc2 += dot(weights, tmp2);
+        acc3 += dot(weights, tmp3);
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This if check is used to handle the last few rows when it can't be divided by the four
+    if(rows_left >= 0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
+        vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
+
+/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: S32
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  input_offset                          Input's quantization offset
+ * @param[in]  weights_offset                        Weights's quantization offset
+ */
+__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
+                                IMAGE_DECLARATION(weights),
+                                VECTOR_DECLARATION(dst),
+                                const int input_offset,
+                                const int weights_offset)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    int acc0 = 0;
+    int acc1 = 0;
+    int acc2 = 0;
+    int acc3 = 0;
+
+    // This kernel handle 4 rows in per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
+        int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
+        int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
+        int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;
+
+        // Accumulate
+        acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
+        acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
+        acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
+        acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This if check is used to handle the last few rows when it can't be divided by the four
+    if(rows_left >= 0)
+    {
+        vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
+#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/generate_proposals.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/generate_proposals.clembed
new file mode 100644
index 0000000..72dbf92
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/generate_proposals.clembed

@@ -0,0 +1,631 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Generate all the region of interests based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: F16/F32
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the boxes tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p in_ptr
+ * @param[out] rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[out] rois_step_x                           pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[out] rois_step_y                           pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[out] rois_step_z                           pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+__kernel void generate_proposals_compute_all_anchors(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const size_t idx = get_global_id(0);
+    // Find the index of the anchor
+    const size_t anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift is this thread using
+    const size_t shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
+    const DATA_TYPE
+    shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
+    const DATA_TYPE
+    shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
+
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
+
+    // Apply the shift to the anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/generate_proposals_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/generate_proposals_quantized.clembed
new file mode 100644
index 0000000..2cce0b9
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/generate_proposals_quantized.clembed

@@ -0,0 +1,1119 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+/** Generate all the region of interests based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM8
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: QASYMM8
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the boxes tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p in_ptr
+ * @param[out] rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[out] rois_step_x                           pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[out] rois_step_y                           pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[out] rois_step_z                           pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
+__kernel void generate_proposals_compute_all_anchors_quantized(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const size_t idx = get_global_id(0);
+    // Find the index of the anchor
+    const size_t anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift is this thread using
+    const size_t shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
+    const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+    const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
+
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(float, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    anchor = DEQUANTIZE(VLOAD(NUM_ROI_FIELDS)(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)), OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS);
+
+    // Apply the shift to the anchor
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    VSTORE(NUM_ROI_FIELDS)
+    (QUANTIZE(shifted_anchor, OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS), 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/harris_corners.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/harris_corners.clembed
new file mode 100644
index 0000000..1e98dcb
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/harris_corners.clembed

@@ -0,0 +1,919 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Function running harris score on 3x3 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ *             e.g. -DDATA_TYPE=short.
+ *
+ * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
+ * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
+ * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
+ * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
+ * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
+ */
+__kernel void harris_score_3x3(
+    IMAGE_DECLARATION(src_gx),
+    IMAGE_DECLARATION(src_gy),
+    IMAGE_DECLARATION(vc),
+    float sensitivity,
+    float strength_thresh,
+    float pow4_normalization_factor)
+{
+    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
+
+    /* Gx^2, Gy^2 and Gx*Gy */
+    float4 gx2  = (float4)0.0f;
+    float4 gy2  = (float4)0.0f;
+    float4 gxgy = (float4)0.0f;
+
+    /* Row0 */
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
+
+    float4 l_gx = convert_float4(temp_gx.s0123);
+    float4 m_gx = convert_float4(temp_gx.s1234);
+    float4 r_gx = convert_float4(temp_gx.s2345);
+
+    float4 l_gy = convert_float4(temp_gy.s0123);
+    float4 m_gy = convert_float4(temp_gy.s1234);
+    float4 r_gy = convert_float4(temp_gy.s2345);
+
+    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+    /* Row1 */
+    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
+    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
+
+    l_gx = convert_float4(temp_gx.s0123);
+    m_gx = convert_float4(temp_gx.s1234);
+    r_gx = convert_float4(temp_gx.s2345);
+
+    l_gy = convert_float4(temp_gy.s0123);
+    m_gy = convert_float4(temp_gy.s1234);
+    r_gy = convert_float4(temp_gy.s2345);
+
+    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+    /* Row2 */
+    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
+    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
+
+    l_gx = convert_float4(temp_gx.s0123);
+    m_gx = convert_float4(temp_gx.s1234);
+    r_gx = convert_float4(temp_gx.s2345);
+
+    l_gy = convert_float4(temp_gy.s0123);
+    m_gy = convert_float4(temp_gy.s1234);
+    r_gy = convert_float4(temp_gy.s2345);
+
+    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+    /* Compute trace and determinant */
+    float4 trace = gx2 + gy2;
+    float4 det   = gx2 * gy2 - (gxgy * gxgy);
+
+    /* Compute harris score */
+    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+    mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+    vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+/** Function for calculating harris score 1x5.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row    Relative row.
+ */
+inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
+{
+    float4 gx2  = 0.0f;
+    float4 gy2  = 0.0f;
+    float4 gxgy = 0.0f;
+
+    /* Row */
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
+
+    float4 gx = convert_float4(temp_gx.s0123);
+    float4 gy = convert_float4(temp_gy.s0123);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx.s1234);
+    gy = convert_float4(temp_gy.s1234);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx.s2345);
+    gy = convert_float4(temp_gy.s2345);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx.s3456);
+    gy = convert_float4(temp_gy.s3456);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx.s4567);
+    gy = convert_float4(temp_gy.s4567);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running harris score on 5x5 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ *             e.g. -DDATA_TYPE=short.
+ *
+ * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
+ * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
+ * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
+ * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
+ * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
+ */
+__kernel void harris_score_5x5(
+    IMAGE_DECLARATION(src_gx),
+    IMAGE_DECLARATION(src_gy),
+    IMAGE_DECLARATION(vc),
+    float sensitivity,
+    float strength_thresh,
+    float pow4_normalization_factor)
+{
+    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
+
+    /* Gx^2, Gy^2 and Gx*Gy */
+    float16 res = (float16)0.0f;
+
+    /* Compute row */
+    for(int i = -2; i < 3; i++)
+    {
+        res += harris_score_1x5(&src_gx, &src_gy, i);
+    }
+
+    float4 gx2  = res.s0123;
+    float4 gy2  = res.s4567;
+    float4 gxgy = res.s89AB;
+
+    /* Compute trace and determinant */
+    float4 trace = gx2 + gy2;
+    float4 det   = gx2 * gy2 - (gxgy * gxgy);
+
+    /* Compute harris score */
+    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+    mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+    vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+/** Function for calculating harris score 1x7.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row    Relative row.
+ */
+inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
+{
+    float4 gx2  = 0.0f;
+    float4 gy2  = 0.0f;
+    float4 gxgy = 0.0f;
+
+    /* Row */
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
+
+    float4 gx = convert_float4(temp_gx0.s0123);
+    float4 gy = convert_float4(temp_gy0.s0123);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx0.s1234);
+    gy = convert_float4(temp_gy0.s1234);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx0.s2345);
+    gy = convert_float4(temp_gy0.s2345);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx0.s3456);
+    gy = convert_float4(temp_gy0.s3456);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4(temp_gx0.s4567);
+    gy = convert_float4(temp_gy0.s4567);
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0));
+    gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
+    gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
+    gx2 += (gx * gx);
+    gy2 += (gy * gy);
+    gxgy += (gx * gy);
+
+    return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running harris score on 7x7 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ *             e.g. -DDATA_TYPE=short.
+ *
+ * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
+ * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
+ * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
+ * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
+ * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
+ */
+__kernel void harris_score_7x7(
+    IMAGE_DECLARATION(src_gx),
+    IMAGE_DECLARATION(src_gy),
+    IMAGE_DECLARATION(vc),
+    float sensitivity,
+    float strength_thresh,
+    float pow4_normalization_factor)
+{
+    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
+
+    /* Gx^2, Gy^2 and Gx*Gy */
+    float16 res = (float16)0.0f;
+
+    /* Compute row */
+    for(int i = -3; i < 4; i++)
+    {
+        res += harris_score_1x7(&src_gx, &src_gy, i);
+    }
+
+    float4 gx2  = res.s0123;
+    float4 gy2  = res.s4567;
+    float4 gxgy = res.s89AB;
+
+    /* Compute trace and determinant */
+    float4 trace = gx2 + gy2;
+    float4 det   = gx2 * gy2 - (gxgy * gxgy);
+
+    /* Compute harris score */
+    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+    mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+    vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/helpers.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/helpers.hembed
new file mode 100644
index 0000000..88fe511
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/helpers.hembed

@@ -0,0 +1,544 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/helpers_asymm.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/helpers_asymm.hembed
new file mode 100644
index 0000000..d5897be
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/helpers_asymm.hembed

@@ -0,0 +1,1033 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/histogram.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/histogram.clembed
new file mode 100644
index 0000000..08b0ffd
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/histogram.clembed

@@ -0,0 +1,786 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define VATOMIC_INC16(histogram, win_pos)   \
+    {                                       \
+        atomic_inc(histogram + win_pos.s0); \
+        atomic_inc(histogram + win_pos.s1); \
+        atomic_inc(histogram + win_pos.s2); \
+        atomic_inc(histogram + win_pos.s3); \
+        atomic_inc(histogram + win_pos.s4); \
+        atomic_inc(histogram + win_pos.s5); \
+        atomic_inc(histogram + win_pos.s6); \
+        atomic_inc(histogram + win_pos.s7); \
+        atomic_inc(histogram + win_pos.s8); \
+        atomic_inc(histogram + win_pos.s9); \
+        atomic_inc(histogram + win_pos.sa); \
+        atomic_inc(histogram + win_pos.sb); \
+        atomic_inc(histogram + win_pos.sc); \
+        atomic_inc(histogram + win_pos.sd); \
+        atomic_inc(histogram + win_pos.se); \
+        atomic_inc(histogram + win_pos.sf); \
+    }
+
+/** Calculate the histogram of an 8 bit grayscale image.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length of num_bins
+ *
+ * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  histogram_local                     The local buffer to hold histogram result in per workgroup. Supported data types: U32
+ * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
+ * @param[out] num_bins                            The number of bins
+ * @param[out] offset                              The start of values to use (inclusive)
+ * @param[out] range                               The range of a bin
+ * @param[out] offrange                            The maximum value (exclusive)
+ */
+__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
+                                __local uint *histogram_local,
+                                __global uint *restrict histogram,
+                                uint                    num_bins,
+                                uint                    offset,
+                                uint                    range,
+                                uint                    offrange)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+    uint  local_id_x   = get_local_id(0);
+
+    uint local_x_size = get_local_size(0);
+
+    if(num_bins > local_x_size)
+    {
+        for(int i = local_id_x; i < num_bins; i += local_x_size)
+        {
+            histogram_local[i] = 0;
+        }
+    }
+    else
+    {
+        if(local_id_x <= num_bins)
+        {
+            histogram_local[local_id_x] = 0;
+        }
+    }
+
+    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+    uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    VATOMIC_INC16(histogram_local, win_pos);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(num_bins > local_x_size)
+    {
+        for(int i = local_id_x; i < num_bins; i += local_x_size)
+        {
+            atomic_add(histogram + i, histogram_local[i]);
+        }
+    }
+    else
+    {
+        if(local_id_x <= num_bins)
+        {
+            atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
+        }
+    }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image's border.
+ *
+ * Each thread will process one pixel using global atomic.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length of num_bins
+ *
+ * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
+ * @param[out] num_bins                            The number of bins
+ * @param[out] offset                              The start of values to use (inclusive)
+ * @param[out] range                               The range of a bin
+ * @param[out] offrange                            The maximum value (exclusive)
+ */
+__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
+                                 __global uint *restrict histogram,
+                                 uint                    num_bins,
+                                 uint                    offset,
+                                 uint                    range,
+                                 uint                    offrange)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+    uint val = (uint)(*input_buffer.ptr);
+
+    uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0;
+
+    if(val >= offset && (val < offrange))
+    {
+        atomic_inc(histogram + win_pos);
+    }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256 elements
+ *
+ * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  histogram_local                     The local buffer to hold histogram result in per workgroup. Supported data types: U32
+ * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
+ */
+__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
+                                      __local uint *histogram_local,
+                                      __global uint *restrict histogram)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+    uint local_index  = get_local_id(0);
+    uint local_x_size = get_local_size(0);
+
+    for(int i = local_index; i < 256; i += local_x_size)
+    {
+        histogram_local[i] = 0;
+    }
+
+    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    atomic_inc(histogram_local + vals.s0);
+    atomic_inc(histogram_local + vals.s1);
+    atomic_inc(histogram_local + vals.s2);
+    atomic_inc(histogram_local + vals.s3);
+    atomic_inc(histogram_local + vals.s4);
+    atomic_inc(histogram_local + vals.s5);
+    atomic_inc(histogram_local + vals.s6);
+    atomic_inc(histogram_local + vals.s7);
+    atomic_inc(histogram_local + vals.s8);
+    atomic_inc(histogram_local + vals.s9);
+    atomic_inc(histogram_local + vals.sa);
+    atomic_inc(histogram_local + vals.sb);
+    atomic_inc(histogram_local + vals.sc);
+    atomic_inc(histogram_local + vals.sd);
+    atomic_inc(histogram_local + vals.se);
+    atomic_inc(histogram_local + vals.sf);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = local_index; i < 256; i += local_x_size)
+    {
+        atomic_add(histogram + i, histogram_local[i]);
+    }
+}
+
+/** Calculate the histogram of an 8 bit grayscale image with bin size as 256 and window size as 1.
+ *
+ * Each thread will process one pixel using global atomic.
+ * When all work items in a work group are done the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256
+ *
+ * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
+ */
+__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
+                                       __global uint *restrict histogram)
+{
+    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+    atomic_inc(histogram + *input_buffer.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/hog.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/hog.clembed
new file mode 100644
index 0000000..7b991dd
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/hog.clembed

@@ -0,0 +1,1054 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+    int x; /**< The x coordinate. */
+    int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/* Keypoint struct */
+typedef struct Keypoint
+{
+    int   x;               /**< The x coordinate. */
+    int   y;               /**< The y coordinate. */
+    float strength;        /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+    float scale;           /**< Initialized to 0 by corner detectors. */
+    float orientation;     /**< Initialized to 0 by corner detectors. */
+    int   tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+    ushort x;         /**< Top-left x coordinate */
+    ushort y;         /**< Top-left y coordinate */
+    ushort width;     /**< Width of the detection window */
+    ushort height;    /**< Height of the detection window */
+    ushort idx_class; /**< Index of the class */
+    float  score;     /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
+
+#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
+
+/** This OpenCL kernel computes the HOG orientation binning
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DCELL_WIDTH = Width of the cell
+ * -# -DCELL_HEIGHT = height of the cell
+ * -# -DNUM_BINS = Number of bins for each cell
+ * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
+ *
+ * @note Each work-item computes a single cell
+ *
+ * @param[in]  mag_ptr                             Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
+ * @param[in]  mag_stride_x                        Stride of the magnitude image in X dimension (in bytes)
+ * @param[in]  mag_step_x                          mag_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mag_stride_y                        Stride of the magnitude image in Y dimension (in bytes)
+ * @param[in]  mag_step_y                          mag_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mag_offset_first_element_in_bytes   The offset of the first element in the magnitude image
+ * @param[in]  phase_ptr                           Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
+ * @param[in]  phase_stride_x                      Stride of the phase image in X dimension (in bytes)
+ * @param[in]  phase_step_x                        phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  phase_stride_y                      Stride of the the phase image in Y dimension (in bytes)
+ * @param[in]  phase_step_y                        phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  phase_offset_first_element_in_bytes The offset of the first element in the the phase image
+ * @param[out] dst_ptr                             Pointer to the destination image which stores the local HOG for each cell Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
+ */
+__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
+                                      IMAGE_DECLARATION(phase),
+                                      IMAGE_DECLARATION(dst))
+{
+    float bins[NUM_BINS] = { 0 };
+
+    // Compute address for the magnitude and phase images
+    Image mag   = CONVERT_TO_IMAGE_STRUCT(mag);
+    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+
+    __global uchar *mag_row_ptr   = mag.ptr;
+    __global uchar *phase_row_ptr = phase.ptr;
+
+    for(int yc = 0; yc < CELL_HEIGHT; ++yc)
+    {
+        int xc = 0;
+        for(; xc <= (CELL_WIDTH - 4); xc += 4)
+        {
+            // Load magnitude and phase values
+            const float4 mag_f32   = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
+            float4       phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
+
+            // Scale phase: phase * scale + 0.5f
+            phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
+
+            // Compute histogram index.
+            int4 hidx_s32 = convert_int4(phase_f32);
+
+            // Compute magnitude weights (w0 and w1)
+            const float4 hidx_f32 = convert_float4(hidx_s32);
+
+            // w1 = phase_f32 - hidx_s32
+            const float4 w1_f32 = phase_f32 - hidx_f32;
+
+            // w0 = 1.0 - w1
+            const float4 w0_f32 = (float4)1.0f - w1_f32;
+
+            // Calculate the weights for splitting vote
+            const float4 mag_w0_f32 = mag_f32 * w0_f32;
+            const float4 mag_w1_f32 = mag_f32 * w1_f32;
+
+            // Weighted vote between 2 bins
+
+            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin 0
+            bins[hidx_s32.s0] += mag_w0_f32.s0;
+            bins[hidx_s32.s1] += mag_w0_f32.s1;
+            bins[hidx_s32.s2] += mag_w0_f32.s2;
+            bins[hidx_s32.s3] += mag_w0_f32.s3;
+
+            hidx_s32 += (int4)1;
+
+            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin1
+            bins[hidx_s32.s0] += mag_w1_f32.s0;
+            bins[hidx_s32.s1] += mag_w1_f32.s1;
+            bins[hidx_s32.s2] += mag_w1_f32.s2;
+            bins[hidx_s32.s3] += mag_w1_f32.s3;
+        }
+
+        // Left over computation
+        for(; xc < CELL_WIDTH; xc++)
+        {
+            const float mag_value   = *((__global short *)mag_row_ptr + xc);
+            const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+            const float w1          = phase_value - floor(phase_value);
+
+            // The quantised phase is the histogram index [0, NUM_BINS - 1]
+            // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
+            const uint hidx = (uint)(phase_value) % NUM_BINS;
+
+            // Weighted vote between 2 bins
+            bins[hidx] += mag_value * (1.0f - w1);
+            bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
+        }
+
+        // Point to the next row of magnitude and phase images
+        mag_row_ptr += mag_stride_y;
+        phase_row_ptr += phase_stride_y;
+    }
+
+    // Compute address for the destination image
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store the local HOG in the global memory
+    int xc = 0;
+    for(; xc <= (NUM_BINS - 4); xc += 4)
+    {
+        float4 values = vload4(0, bins + xc);
+
+        vstore4(values, 0, ((__global float *)dst.ptr) + xc);
+    }
+
+    // Left over stores
+    for(; xc < NUM_BINS; ++xc)
+    {
+        ((__global float *)dst.ptr)[xc] = bins[xc];
+    }
+}
+#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
+
+#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
+
+#ifndef L2_NORM
+#error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
+#endif /* not L2_NORM */
+
+#ifndef L2HYS_NORM
+#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
+#endif /* not L2HYS_NORM */
+
+#ifndef L1_NORM
+#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
+#endif /* not L1_NORM */
+
+/** This OpenCL kernel computes the HOG block normalization
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
+ * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
+ * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
+ * -# -DHOG_NORM_TYPE = Normalization type
+ * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
+ * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
+ * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
+ * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
+ *
+ * @note Each work-item computes a single block
+ *
+ * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image which stores the normlized HOG Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
+                                      IMAGE_DECLARATION(dst))
+{
+    float  sum     = 0.0f;
+    float4 sum_f32 = (float4)(0.0f);
+
+    // Compute address for the source and destination tensor
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
+    {
+        const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
+
+        int xc = 0;
+        for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
+        {
+            const float4 val0 = vload4(0, hist_ptr + xc + 0);
+            const float4 val1 = vload4(0, hist_ptr + xc + 4);
+            const float4 val2 = vload4(0, hist_ptr + xc + 8);
+            const float4 val3 = vload4(0, hist_ptr + xc + 12);
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+            // Compute val^2 for L2_NORM or L2HYS_NORM
+            sum_f32 += val0 * val0;
+            sum_f32 += val1 * val1;
+            sum_f32 += val2 * val2;
+            sum_f32 += val3 * val3;
+#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
+            // Compute |val| for L1_NORM
+            sum_f32 += fabs(val0);
+            sum_f32 += fabs(val1);
+            sum_f32 += fabs(val2);
+            sum_f32 += fabs(val3);
+#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
+
+            // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
+            // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
+            // will be accessed consecutively
+            vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
+            vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
+            vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
+            vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
+        }
+
+        // Compute left over
+        for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
+        {
+            const float val = hist_ptr[xc];
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+            sum += val * val;
+#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
+            sum += fabs(val);
+#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
+
+            ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
+        }
+    }
+
+    sum += dot(sum_f32, (float4)1.0f);
+
+    float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
+
+#if(HOG_NORM_TYPE == L2HYS_NORM)
+    // Reset sum
+    sum_f32 = (float4)0.0f;
+    sum     = 0.0f;
+
+    int k = 0;
+    for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
+    {
+        float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
+        float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
+        float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
+        float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
+
+        // Scale val
+        val0 = val0 * (float4)scale;
+        val1 = val1 * (float4)scale;
+        val2 = val2 * (float4)scale;
+        val3 = val3 * (float4)scale;
+
+        // Clip val if over _threshold_l2hys
+        val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
+        val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
+        val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
+        val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
+
+        // Compute val^2
+        sum_f32 += val0 * val0;
+        sum_f32 += val1 * val1;
+        sum_f32 += val2 * val2;
+        sum_f32 += val3 * val3;
+
+        vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
+        vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
+        vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
+        vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
+    }
+
+    // Compute left over
+    for(; k < NUM_BINS_PER_BLOCK; ++k)
+    {
+        float val = ((__global float *)dst.ptr)[k] * scale;
+
+        // Clip scaled input_value if over L2_HYST_THRESHOLD
+        val = fmin(val, (float)L2_HYST_THRESHOLD);
+
+        sum += val * val;
+
+        ((__global float *)dst.ptr)[k] = val;
+    }
+
+    sum += dot(sum_f32, (float4)1.0f);
+
+    // We use the same constants of OpenCV
+    scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
+
+    int i = 0;
+    for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
+    {
+        float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
+        float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
+        float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
+        float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
+
+        // Multiply val by the normalization scale factor
+        val0 = val0 * (float4)scale;
+        val1 = val1 * (float4)scale;
+        val2 = val2 * (float4)scale;
+        val3 = val3 * (float4)scale;
+
+        vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
+        vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
+        vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
+        vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
+    }
+
+    for(; i < NUM_BINS_PER_BLOCK; ++i)
+    {
+        ((__global float *)dst.ptr)[i] *= scale;
+    }
+}
+#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
+
+#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
+
+/** This OpenCL kernel computes the HOG detector using linear SVM
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
+ * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
+ * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
+ * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectioWindow array
+ * -# -DIDX_CLASS = Index of the class to detect
+ * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
+ * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
+ * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
+ * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
+ *
+ * @note Each work-item computes a single detection window
+ *
+ * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  hog_descriptor                    Pointer to HOG descriptor. Supported data types: F32
+ * @param[out] dst                               Pointer to DetectionWindow array
+ * @param[out] num_detection_windows             Number of objects detected
+ */
+__kernel void hog_detector(IMAGE_DECLARATION(src),
+                           __global float *hog_descriptor,
+                           __global DetectionWindow *dst,
+                           __global uint *num_detection_windows)
+{
+    // Check if the DetectionWindow array is full
+    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
+    {
+        return;
+    }
+
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    const int src_step_y_f32 = src_stride_y / sizeof(float);
+
+    // Init score_f32 with 0
+    float4 score_f32 = (float4)0.0f;
+
+    // Init score with 0
+    float score = 0.0f;
+
+    __global float *src_row_ptr = (__global float *)src.ptr;
+
+    // Compute Linear SVM
+    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
+    {
+        int xb = 0;
+
+        const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
+
+        for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
+        {
+            // Load descriptor values
+            float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
+            float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
+
+            float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
+            float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
+
+            // Multiply accumulate
+            score_f32 += a0_f32 * b0_f32;
+            score_f32 += a1_f32 * b1_f32;
+        }
+
+        for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
+        {
+            const float a = src_row_ptr[xb];
+            const float b = hog_descriptor[xb + offset_y];
+
+            score += a * b;
+        }
+    }
+
+    score += dot(score_f32, (float4)1.0f);
+
+    // Add the bias. The bias is located at the position (descriptor_size() - 1)
+    // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
+    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
+
+    if(score > (float)THRESHOLD)
+    {
+        int id = atomic_inc(num_detection_windows);
+        if(id < MAX_NUM_DETECTION_WINDOWS)
+        {
+            dst[id].x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
+            dst[id].y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
+            dst[id].width     = DETECTION_WINDOW_WIDTH;
+            dst[id].height    = DETECTION_WINDOW_HEIGHT;
+            dst[id].idx_class = IDX_CLASS;
+            dst[id].score     = score;
+        }
+    }
+}
+#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
+        * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/im2col.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/im2col.clembed
new file mode 100644
index 0000000..9f5f138
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/im2col.clembed

@@ -0,0 +1,1835 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col1x1_stridex1_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const uint xc    = get_global_id(0) * 4;         // x coordinate in the convolved tensor
+    const uint yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const uint ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+    // Clamp xc
+    // The strategy clamps at "xc" as it will be a valid value for sure
+    uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
+
+    // Check which values are valid
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+
+    xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
+
+    // Calculate input indices
+    const uint xi = xc;
+    const uint yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+
+#if defined(NUM_GROUPS)
+    const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
+    const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else                                                   // defined(NUM_GROUPS)
+    const uint xo              = ch;
+#endif                                                  // defined(NUM_GROUPS)
+    const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    data = vload4(0, (__global DATA_TYPE *)input_ptr);
+
+    // If out-of-bound, overwrite with the first element
+    data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
+
+    *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
+    *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
+    *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
+    *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#if defined(DILATION_X) && defined(DILATION_Y)
+/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else                                         // defined(NUM_GROUPS)
+    const int xo                   = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif                                        // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else  // defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
+    // Linearize convolution elements
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+    {
+        int y = yi + yk * DILATION_Y;
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
+        {
+            int x = xi + xk * DILATION_X;
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+            *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+            {
+                *output_ptr = PAD_VALUE;
+            }
+            else
+            {
+                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            }
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+        }
+    }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *output_ptr = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(DILATION_X) && defined(DILATION_Y)
+
+/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else                                         // defined(NUM_GROUPS)
+    const int xo               = ch * 9; // 3x3
+#endif                                        // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Put 0 if the value is out-of-bound
+    int3 x = (int3)xi + (int3)(0, 1, 2);
+    int3 y = (int3)yi + (int3)(0, 1, 2);
+
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+
+    row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
+    row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
+    row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
+    *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col5x5_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else                                         // defined(NUM_GROUPS)
+    const int xo               = ch * 25; // 5x5
+#endif                                        // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Put 0 if the value is out-of-bound
+    int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
+    int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
+    int  x1 = xi + 4;
+    int  y1 = yi + 4;
+
+    // Check if we could have out-of-bounds elements in the x direction
+    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+    x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+    y0_condition                = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+    COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
+    COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond10                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
+        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
+
+        // Replace with 0 if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+                                              row10.s012),
+                0, (__global DATA_TYPE *)output_ptr);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond10                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
+        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
+
+        // Replace with 0 if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+                                              row10.s012),
+                0, (__global DATA_TYPE *)output_ptr);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
+
+        // Replace with 0 if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
+        *((__global DATA_TYPE *)output_ptr + 4) = row01;
+
+        output_ptr += 5 * dst_stride_x;
+    }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col11x11_padx0_pady0_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else                                         // defined(NUM_GROUPS)
+    const int xo               = ch * 121; // 11x11
+#endif                                        // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * src_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        output_ptr += 11 * src_stride_x;
+    }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_padx0_pady0_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else                                         // defined(NUM_GROUPS)
+    const int xo                   = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif                                        // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else  // defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
+    // Linearize convolution elements
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        int last_x = 0;
+        for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
+        {
+            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            VSTORE(VECTOR_SIZE)
+            (row, 0, output_ptr);
+            last_x = x;
+        }
+        // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
+        // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.
+#if WIDTH_MOD_VECTOR_SIZE == 1
+        *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+        VSTORE(WIDTH_MOD_VECTOR_SIZE)
+        (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+        output_ptr += WIDTH_MOD_VECTOR_SIZE;
+    } /* End of loop over KERNEL_HEIGHT */
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *output_ptr = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
+
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+    const int yo    = get_global_id(1);
+    const int batch = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int  yi_coord = 0;
+    int3 offset   = 0;
+
+    // Clamp xi
+    int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+    xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+    xi_offset *= (int3)src_stride_y;
+
+    // Out-of-bound condition for X
+    int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+    // yi == 0
+    // Clamp yi
+    // yi_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    yi_coord = yi - (int)PAD_TOP;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+    values0    = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+    values1    = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+    values2    = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // yi == 1
+    // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+    yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with zeros
+    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+    values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+    values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+    values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // yi == 2
+    // Clamp yi_coord
+    yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+    values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+    values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+    values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // Store
+    VSTORE(VECTOR_SIZE)
+    (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);
+
+#ifdef HAS_BIAS
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i)                                                                                                                                                       \
+    ({                                                                                                                                                                     \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                                                                                     \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                                                                                            \
+        \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                                                                                             \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                                                                                             \
+        \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));                                                                          \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));                                                                          \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));                                                                          \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));                                                                          \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));                                                                          \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));                                                                          \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));                                                                          \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));                                                                          \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));                                                                             \
+        \
+        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                                                                              \
+        values0    = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s0)); \
+        values1    = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s1)); \
+        values2    = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s2)); \
+        values3    = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s3)); \
+        values4    = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s4)); \
+        values5    = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s5)); \
+        values6    = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s6)); \
+        values7    = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s7)); \
+        values8    = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond1));    \
+        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH);                                                                                        \
+    })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i)                                                                              \
+    ({                                                                                            \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                            \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                   \
+        \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                    \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                    \
+        \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));    \
+        \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH);               \
+    })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+    const int yo    = get_global_id(1);
+    const int batch = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int  yi_coord = 0;
+    int8 offset0  = 0;
+    int  offset1  = 0;
+
+    // Clamp xi
+    int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+    int  xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+    xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+    xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+    xi_offset0 *= (int8)src_stride_y;
+    xi_offset1 *= (int)src_stride_y;
+
+    // Out-of-bound condition for X
+    int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+    int  x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+    IM2COL1x9(0);
+    IM2COL1x9(1);
+    IM2COL1x9(2);
+    IM2COL1x9(3);
+    IM2COL1x9(4);
+    IM2COL1x9(5);
+    IM2COL1x9(6);
+    IM2COL1x9(7);
+    IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+    const int yo    = get_global_id(1);
+    const int batch = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int i = 0;
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+    {
+        // Clamp yi_coord
+        int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+        yi_coord     = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+
+        // Out-of-bound condition for Y
+        int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
+
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+        {
+            // Clamp xi_coord
+            int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+            xi_coord     = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+            // Out-of-bound condition for X
+            int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+            int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+            VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+            // Replace with PAD_VALUE if the value is out-of-bound
+            values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition));
+
+            // Store
+            VSTORE(VECTOR_SIZE)
+            (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+
+            i++;
+        }
+    }
+
+#ifdef HAS_BIAS
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/instance_normalization.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/instance_normalization.clembed
new file mode 100644
index 0000000..177de4e
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/instance_normalization.clembed

@@ -0,0 +1,733 @@
+R"(
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
+ * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void instance_normalization(
+    TENSOR4D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
+    INTERNAL_DATA_TYPE sum    = 0.f;
+    INTERNAL_DATA_TYPE sum_sq = 0.f;
+
+#if defined(NHWC)
+
+    const int ch             = get_global_id(0); // Current channel
+    const int batch          = get_global_id(2); // Current batch
+    const int elements_plane = DIM_Y * DIM_Z;
+
+    for(int i_w = 0; i_w < DIM_Y; ++i_w)
+    {
+        for(int i_h = 0; i_h < DIM_Z; ++i_h)
+        {
+            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+            sum += data;
+            sum_sq += data * data;
+        }
+    }
+
+#else // !defined(NHWC)
+    const int ch             = get_global_id(2) % DIM_Z; // Current channel
+    const int batch          = get_global_id(2) / DIM_Z; // Current batch
+    const int elements_plane = DIM_X * DIM_Y;
+
+    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+    part_sum = 0.f;
+    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+    part_sum_sq = 0.f;
+    // Calculate partial sum
+    for(int y = 0; y < DIM_Y; ++y)
+    {
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            // Load data
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
+            part_sum.s0 += data;
+            part_sum_sq.s0 += data * data;
+        }
+    }
+    // Perform reduction
+#if VEC_SIZE > 8
+    part_sum.s01234567 += part_sum.s89abcdef;
+    part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+    part_sum.s0123 += part_sum.s4567;
+    part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+    part_sum.s01 += part_sum.s23;
+    part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+    part_sum.s0 += part_sum.s1;
+    part_sum_sq.s0 += part_sum_sq.s1;
+
+    sum    = (INTERNAL_DATA_TYPE)part_sum.s0;
+    sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
+
+#endif // defined(NHWC)
+
+    const INTERNAL_DATA_TYPE mean   = (sum / elements_plane);
+    const INTERNAL_DATA_TYPE var    = (sum_sq / elements_plane) - (mean * mean);
+    const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
+
+#if defined(NHWC)
+
+    for(int i_w = 0; i_w < DIM_Y; ++i_w)
+    {
+        for(int i_h = 0; i_h < DIM_Z; ++i_h)
+        {
+            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+#ifdef IN_PLACE
+            __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+#endif /* IN_PLACE */
+            *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
+        }
+    }
+
+#else // !defined(NHWC)
+    for(int y = 0; y < DIM_Y; ++y)
+    {
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+            __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
+            VSTORE(VEC_SIZE)
+            (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+            __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+            *(output_address)                  = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
+        }
+    }
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/integral_image.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/integral_image.clembed
new file mode 100644
index 0000000..6c8cd93
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/integral_image.clembed

@@ -0,0 +1,643 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function computes the horizontal integral of the image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void integral_horizontal(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uint prev = 0;
+
+    for(uint j = 0; j < src_step_x; j += 16)
+    {
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
+        res.s0 += prev;
+        res.s1 += res.s0;
+        res.s2 += res.s1;
+        res.s3 += res.s2;
+        res.s4 += res.s3;
+        res.s5 += res.s4;
+        res.s6 += res.s5;
+        res.s7 += res.s6;
+        res.s8 += res.s7;
+        res.s9 += res.s8;
+        res.sA += res.s9;
+        res.sB += res.sA;
+        res.sC += res.sB;
+        res.sD += res.sC;
+        res.sE += res.sD;
+        res.sF += res.sE;
+        prev = res.sF;
+        vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
+    }
+}
+
+/** This function computes the vertical integral of the image.
+ *
+ * @param[in,out] src_ptr                           Pointer to the source image. Supported data types: U32
+ * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]     height                            Image height.
+ */
+__kernel void integral_vertical(
+    IMAGE_DECLARATION(src),
+    uint height)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0));
+    for(uint j = 1; j < height; ++j)
+    {
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
+        res += prev;
+        vstore8(res, 0, (__global uint *)offset(&src, 0, j));
+        prev = res;
+    }
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/l2_normalize.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/l2_normalize.clembed
new file mode 100644
index 0000000..de40f61
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/l2_normalize.clembed

@@ -0,0 +1,707 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This kernel performs l2 normalization on x-axis
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                           Epsilon value
+ */
+__kernel void l2_normalize_x(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(sum),
+    IMAGE_DECLARATION(dst),
+    DATA_TYPE epsilon)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, (__global DATA_TYPE *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+
+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** This kernel performs l2 normalization on y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                           Epsilon value
+ */
+__kernel void l2_normalize_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(sum),
+    IMAGE_DECLARATION(dst),
+    DATA_TYPE epsilon)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, (__global DATA_TYPE *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
+
+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+/** This kernel performs l2 normalization on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                           Epsilon value
+ */
+__kernel void l2_normalize_z(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst),
+    DATA_TYPE epsilon)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D sum = CONVERT_TO_TENSOR3D_STRUCT(sum);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, (__global DATA_TYPE *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
+
+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/magnitude_phase.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/magnitude_phase.clembed
new file mode 100644
index 0000000..0aba315
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/magnitude_phase.clembed

@@ -0,0 +1,705 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Calculates L1 normalization between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L1 normalization magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+    return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates L2 normalization between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L2 normalization magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
+{
+    return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
+                       VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates unsigned phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
+ */
+inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+    float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
+    angle_deg_f32         = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
+    return convert_uchar16(angle_deg_f32);
+}
+
+/** Calculates signed phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
+ */
+inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+    float16 arct = atan2pi(convert_float16(b), convert_float16(a));
+    arct         = select(arct, arct + 2, arct < 0.0f);
+
+    return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu);
+}
+
+#if(1 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
+#elif(2 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
+#else /* MAGNITUDE */
+#define MAGNITUDE_OP(x, y)
+#endif /* MAGNITUDE */
+
+#if(1 == PHASE)
+#define PHASE_OP(x, y) phase_unsigned((x), (y))
+#elif(2 == PHASE)
+#define PHASE_OP(x, y) phase_signed((x), (y))
+#else /* PHASE */
+#define PHASE_OP(x, y)
+#endif /* PHASE */
+
+/** Calculate the magnitude and phase of given the gradients of an image.
+ *
+ * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2).
+ * @note Phase calculation supported: Unsigned(type = 1) [0,128] and Signed(type = 2) [0,256).
+ *
+ * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1
+ * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1
+ * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int
+ *
+ * @param[in]  gx_ptr                                  Pointer to the first source image (gradient X). Supported data types: S16, S32
+ * @param[in]  gx_stride_x                             Stride of the source image in X dimension (in bytes)
+ * @param[in]  gx_step_x                               gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gx_stride_y                             Stride of the source image in Y dimension (in bytes)
+ * @param[in]  gx_step_y                               gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  gx_offset_first_element_in_bytes        The offset of the first element in the source image
+ * @param[in]  gy_ptr                                  Pointer to the second source image (gradient Y) . Supported data types: S16, S32
+ * @param[in]  gy_stride_x                             Stride of the destination image in X dimension (in bytes)
+ * @param[in]  gy_step_x                               gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gy_stride_y                             Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  gy_step_y                               gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  gy_offset_first_element_in_bytes        The offset of the first element in the destination image
+ * @param[out] magnitude_ptr                           Pointer to the magnitude destination image. Supported data types: S16, S32
+ * @param[in]  magnitude_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  magnitude_step_x                        magnitude_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  magnitude_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  magnitude_step_y                        magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  magnitude_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] phase_ptr                               Pointer to the phase destination image. Supported data types: U8
+ * @param[in]  phase_stride_x                          Stride of the destination image in X dimension (in bytes)
+ * @param[in]  phase_step_x                            phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  phase_stride_y                          Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  phase_step_y                            phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  phase_offset_first_element_in_bytes     The offset of the first element in the destination image
+ * */
+__kernel void magnitude_phase(
+    IMAGE_DECLARATION(gx),
+    IMAGE_DECLARATION(gy)
+#ifdef MAGNITUDE
+    ,
+    IMAGE_DECLARATION(magnitude)
+#endif /* MAGNITUDE */
+#ifdef PHASE
+    ,
+    IMAGE_DECLARATION(phase)
+#endif /* PHASE */
+)
+{
+    // Get pixels pointer
+    Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
+    Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
+
+    // Calculate and store the results
+#ifdef MAGNITUDE
+    Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
+    vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
+#endif /* MAGNITUDE */
+#ifdef PHASE
+    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+    vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
+#endif /* PHASE */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/mean_stddev.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/mean_stddev.clembed
new file mode 100644
index 0000000..ab4fb17
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/mean_stddev.clembed

@@ -0,0 +1,625 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+
+/** This function calculates the sum and sum of squares of a given input image.
+ *
+ * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  height                            Height of the input image
+ * @param[out] global_sum                        Global sum of all elements
+ * @param[out] global_sum_sq                     Global sum of squares of all elements
+ */
+__kernel void mean_stddev_accumulate(
+    IMAGE_DECLARATION(src),
+    uint     height,
+    __global ulong *global_sum
+#ifdef STDDEV
+    ,
+    __global ulong *global_sum_sq
+#endif /* STDDEV */
+)
+{
+    // Get pixels pointer
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    uint8 tmp_sum = 0;
+#ifdef STDDEV
+    uint8 tmp_sum_sq = 0;
+#endif /* STDDEV */
+    // Calculate partial sum
+    for(int i = 0; i < height; i++)
+    {
+        // Load data
+        uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
+
+        tmp_sum += data;
+#ifdef STDDEV
+        tmp_sum_sq += data * data;
+#endif /* STDDEV */
+    }
+    // Perform reduction
+    tmp_sum.s0123 += tmp_sum.s4567;
+    tmp_sum.s01 += tmp_sum.s23;
+    atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
+
+#ifdef STDDEV
+    tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
+    tmp_sum_sq.s01 += tmp_sum_sq.s23;
+    atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
+#endif /* STDDEV */
+}
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/mean_stddev_normalization.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/mean_stddev_normalization.clembed
new file mode 100644
index 0000000..3ff83ac
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/mean_stddev_normalization.clembed

@@ -0,0 +1,667 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void mean_stddev_normalization(
+    IMAGE_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    IMAGE_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Image in = CONVERT_TO_IMAGE_STRUCT(input);
+#ifdef IN_PLACE
+    Image out = in;
+#else  /* IN_PLACE */
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+#endif /* IN_PLACE */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    sum = 0.f;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    sum_sq = 0.f;
+    // Calculate partial sum
+    int i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        // Load data
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
+
+        sum += data;
+        sum_sq += data * data;
+    }
+    // Perform reduction
+#if VEC_SIZE > 8
+    sum.s01234567 += sum.s89abcdef;
+    sum_sq.s01234567 += sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+    sum.s0123 += sum.s4567;
+    sum_sq.s0123 += sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+    sum.s01 += sum.s23;
+    sum_sq.s01 += sum_sq.s23;
+#endif // VEC_SIZE > 2
+    sum.s0 += sum.s1;
+    sum_sq.s0 += sum_sq.s1;
+    // Left-overs loop
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
+
+        sum.s0 += data;
+        sum_sq.s0 += data * data;
+    }
+
+    DATA_TYPE mean       = sum.s0 / WIDTH;
+    DATA_TYPE var        = (sum_sq.s0 / WIDTH) - (mean * mean);
+    DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON);
+
+    i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
+
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        res = (data - mean) * stddev_inv;
+        VSTORE(VEC_SIZE)
+        (res, 0, (__global DATA_TYPE *)offset(&out, i, 0));
+    }
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
+
+        *((__global DATA_TYPE *)offset(&out, i, 0)) = (data - mean) * stddev_inv;
+    }
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/memset.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/memset.clembed
new file mode 100644
index 0000000..ab13d87
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/memset.clembed

@@ -0,0 +1,610 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with all value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value use to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] tensor_step_x                        tensor_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] tensor_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] tensor_step_y                        tensor_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] value                                The value used to fill the pages of the tensor
+ */
+__kernel void memset(
+    TENSOR3D_DECLARATION(tensor))
+{
+    Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = (DATA_TYPE)(CONSTANT_VALUE);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else  // !defined(VEC_SIZE)
+    *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // Check for compile time constants
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/minmax_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/minmax_layer.clembed
new file mode 100644
index 0000000..fb32ae0
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/minmax_layer.clembed

@@ -0,0 +1,644 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+/** This function identifies the min and maximum value of an input 3D tensor.
+ *
+ * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3)
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr                           Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] dst_stride_x                      Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void minmax_layer(
+    TENSOR3D_DECLARATION(src),
+    VECTOR_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Vector   dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+    float4 min_value     = (float4)FLT_MAX;
+    float4 max_value     = (float4) - FLT_MAX;
+    float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX);
+
+    for(int z = 0; z < DEPTH; ++z)
+    {
+        for(int y = 0; y < HEIGHT; ++y)
+        {
+            int             x        = 0;
+            __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z);
+
+            for(; x <= (int)(WIDTH - 8); x += 8)
+            {
+                float8 value = *(src_addr + x);
+
+                min_value = select(value.s0123, min_value, min_value < value.s0123);
+                min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+                max_value = select(value.s0123, max_value, max_value > value.s0123);
+                max_value = select(value.s4567, max_value, max_value > value.s4567);
+            }
+
+            for(; x < WIDTH; ++x)
+            {
+                float value = *(src_addr + x);
+
+                min_max_value.s0 = min(min_max_value.s0, value);
+                min_max_value.s1 = max(min_max_value.s1, value);
+            }
+        }
+    }
+
+    // Perform min/max reduction
+    min_value.s01 = min(min_value.s01, min_value.s23);
+    min_value.s0  = min(min_value.s0, min_value.s1);
+    max_value.s01 = max(max_value.s01, max_value.s23);
+    max_value.s0  = max(max_value.s0, max_value.s1);
+
+    min_max_value.s0 = min(min_max_value.s0, min_value.s0);
+    min_max_value.s1 = max(min_max_value.s1, max_value.s0);
+
+    if(min_max_value.s0 == min_max_value.s1)
+    {
+        min_max_value.s0 = 0.0f;
+        min_max_value.s1 = 1.0f;
+    }
+
+    // Store min and max
+    vstore2(min_max_value, 0, (__global float *)dst.ptr);
+}
+#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/minmaxloc.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/minmaxloc.clembed
new file mode 100644
index 0000000..f5b5df3
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/minmaxloc.clembed

@@ -0,0 +1,791 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+    int x; /**< The x coordinate. */
+    int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/* Keypoint struct */
+typedef struct Keypoint
+{
+    int   x;               /**< The x coordinate. */
+    int   y;               /**< The y coordinate. */
+    float strength;        /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+    float scale;           /**< Initialized to 0 by corner detectors. */
+    float orientation;     /**< Initialized to 0 by corner detectors. */
+    int   tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+    ushort x;         /**< Top-left x coordinate */
+    ushort y;         /**< Top-left y coordinate */
+    ushort width;     /**< Width of the detection window */
+    ushort height;    /**< Height of the detection window */
+    ushort idx_class; /**< Index of the class */
+    float  score;     /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
+
+#ifndef DATA_TYPE_MIN
+#define DATA_TYPE_MIN 0x0
+#endif /* DATA_TYPE_MIN */
+
+#ifndef DATA_TYPE_MAX
+#define DATA_TYPE_MAX 0xFF
+#endif /* DATA_TYPE_MAX */
+
+inline int FloatFlip(float val)
+{
+    union
+    {
+        int   int_val;
+        float flt_val;
+    } u_val;
+    u_val.flt_val = val;
+    return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF;
+}
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
+__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** This function identifies the min and maximum value of an input image.
+ *
+ * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
+ * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
+ * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[in]  width                             Input image width
+ */
+__kernel void minmax(
+    IMAGE_DECLARATION(src),
+    __global int *min_max,
+    int           width)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    // Initialize local minimum and local maximum
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    local_min = type_max;
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    local_max = type_min;
+
+    // Calculate min/max of row
+    int i = 0;
+    for(; i + 16 <= width; i += 16)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 16)
+        data      = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
+        local_min = min(data, local_min);
+        local_max = max(data, local_max);
+    }
+
+#ifdef NON_MULTIPLE_OF_16
+    // Handle non multiple of 16
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
+#ifdef IS_DATA_TYPE_FLOAT
+    int16 valid_indices = (i + idx16) < width;
+#else  /* IS_DATA_TYPE_FLOAT */
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif /* IS_DATA_TYPE_FLOAT */
+    local_max = max(local_max, select(type_min, data, valid_indices));
+    local_min = min(local_min, select(type_max, data, valid_indices));
+#endif /* NON_MULTIPLE_OF_16 */
+
+    // Perform min/max reduction
+    local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
+    local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
+
+    local_min.s0123 = min(local_min.s0123, local_min.s4567);
+    local_max.s0123 = max(local_max.s0123, local_max.s4567);
+
+    local_min.s01 = min(local_min.s01, local_min.s23);
+    local_max.s01 = max(local_max.s01, local_max.s23);
+
+    local_min.s0 = min(local_min.s0, local_min.s1);
+    local_max.s0 = max(local_max.s0, local_max.s1);
+
+    // Update global min/max
+#ifdef IS_DATA_TYPE_FLOAT
+    atomic_min(&min_max[0], FloatFlip(local_min.s0));
+    atomic_max(&min_max[1], FloatFlip(local_max.s0));
+#else  /* IS_DATA_TYPE_FLOAT */
+    atomic_min(&min_max[0], local_min.s0);
+    atomic_max(&min_max[1], local_max.s0);
+#endif /* IS_DATA_TYPE_FLOAT */
+}
+
+/** This function counts the min and max occurrences in an image and tags their position.
+ *
+ * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
+ * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[out] min_max_count                     Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
+ * @param[out] min_loc                           Array that holds the location of the minimum value occurrences
+ * @param[in]  max_min_loc_count                 The maximum number of min value occurrences coordinates the array can hold
+ * @param[out] max_loc                           Array that holds the location of the maximum value occurrences
+ * @param[in]  max_max_loc_count                 The maximum number of max value occurrences coordinates the array can hold
+ */
+__kernel void minmaxloc(
+    IMAGE_DECLARATION(src),
+    __global int *min_max,
+    __global uint *min_max_count
+#ifdef LOCATE_MIN
+    ,
+    __global Coordinates2D *min_loc, uint max_min_loc_count
+#endif /* LOCATE_MIN */
+#ifdef LOCATE_MAX
+    ,
+    __global Coordinates2D *max_loc, uint max_max_loc_count
+#endif /* LOCATE_MAX */
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+#ifdef IS_DATA_TYPE_FLOAT
+    __global float *min_max_ptr = (__global float *)min_max;
+    float           min_value   = min_max_ptr[0];
+    float           max_value   = min_max_ptr[1];
+#else  /* IS_DATA_TYPE_FLOAT */
+    int min_value = min_max[0];
+    int max_value = min_max[1];
+#endif /* IS_DATA_TYPE_FLOAT */
+
+    DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
+#ifdef COUNT_MIN_MAX
+    if(value == min_value)
+    {
+        uint idx = atomic_inc(&min_max_count[0]);
+#ifdef LOCATE_MIN
+        if(idx < max_min_loc_count)
+        {
+            min_loc[idx].x = get_global_id(0);
+            min_loc[idx].y = get_global_id(1);
+        }
+#endif /* LOCATE_MIN */
+    }
+    if(value == max_value)
+    {
+        uint idx = atomic_inc(&min_max_count[1]);
+#ifdef LOCATE_MAX
+        if(idx < max_max_loc_count)
+        {
+            max_loc[idx].x = get_global_id(0);
+            max_loc[idx].y = get_global_id(1);
+        }
+#endif /* LOCATE_MAX */
+    }
+#endif /* COUNT_MIN_MAX */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter3x3.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter3x3.clembed
new file mode 100644
index 0000000..5634c9d
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter3x3.clembed

@@ -0,0 +1,873 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sorts element-wise two vectors.
+ *
+ * @param[in, out] a First vector
+ * @param[in, out] b Second vector
+ */
+#define SORT(a, b)                  \
+    {                               \
+        uchar8 min_val = min(a, b); \
+        uchar8 max_val = max(a, b); \
+        a              = min_val;   \
+        b              = max_val;   \
+    }
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 5 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
+{
+    SORT(p0, p1);
+    SORT(p2, p3);
+    SORT(p0, p2);
+    SORT(p1, p3);
+    SORT(p1, p2);
+    SORT(p0, p4);
+    SORT(p1, p4);
+    SORT(p2, p4);
+
+    return p2;
+}
+
+/** Sorting network to sort 9 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ * @param[in] p5 Sixth element vector
+ * @param[in] p6 Seventh element vector
+ * @param[in] p7 Eigth element vector
+ * @param[in] p8 Ninth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
+{
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p7, p8);
+    SORT(p0, p1);
+    SORT(p3, p4);
+    SORT(p6, p7);
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p7, p8);
+    SORT(p0, p3);
+    SORT(p5, p8);
+    SORT(p4, p7);
+    SORT(p3, p6);
+    SORT(p1, p4);
+    SORT(p2, p5);
+    SORT(p4, p7);
+    SORT(p4, p2);
+    SORT(p6, p4);
+    SORT(p4, p2);
+
+    return p4;
+}
+
+/** Calculate the minimum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_min_3(uchar16 val)
+{
+    return min(val.s01234567, min(val.s12345678, val.s23456789));
+}
+
+/** Calculate the maximum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_max_3(uchar16 val)
+{
+    return max(val.s01234567, max(val.s12345678, val.s23456789));
+}
+
+/** Calculate the minimum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_min_5(uchar16 val)
+{
+    return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
+}
+
+/** Calculate the maximum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_max_5(uchar16 val)
+{
+    return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
+}
+
+/** This function applies a non linear filter on a 3x3 box basis on an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box3x3(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    uchar16 top    = vload16(0, offset(&src, -1, -1));
+    uchar16 middle = vload16(0, offset(&src, -1, 0));
+    uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+    // Apply respective filter
+#ifdef MIN
+    uchar16 tmp = min(top, min(middle, bottom));
+    uchar8  out = row_reduce_min_3(tmp);
+#elif defined(MAX)
+    uchar16 tmp = max(top, max(middle, bottom));
+    uchar8  out = row_reduce_max_3(tmp);
+#elif defined(MEDIAN)
+    uchar8 p0  = top.s01234567;
+    uchar8 p1  = top.s12345678;
+    uchar8 p2  = top.s23456789;
+    uchar8 p3  = middle.s01234567;
+    uchar8 p4  = middle.s12345678;
+    uchar8 p5  = middle.s23456789;
+    uchar8 p6  = bottom.s01234567;
+    uchar8 p7  = bottom.s12345678;
+    uchar8 p8  = bottom.s23456789;
+    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else /* MIN or MAX or MEDIAN */
+#error "Unsupported filter function"
+#endif /* MIN or MAX or MEDIAN */
+
+    // Store result
+    vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non linear filter on a 3x3 cross basis on an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross3x3(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    uchar8  top    = vload8(0, offset(&src, 0, -1));
+    uchar16 middle = vload16(0, offset(&src, -1, 0));
+    uchar8  bottom = vload8(0, offset(&src, 0, 1));
+
+    // Apply respective filter
+#ifdef MIN
+    uchar8 tmp_middle = row_reduce_min_3(middle);
+    uchar8 out        = min(tmp_middle, min(top, bottom));
+#elif defined(MAX)
+    uchar8  tmp_middle = row_reduce_max_3(middle);
+    uchar8  out        = max(tmp_middle, max(top, bottom));
+#elif defined(MEDIAN)
+    uchar8 p0  = top.s01234567;
+    uchar8 p1  = middle.s01234567;
+    uchar8 p2  = middle.s12345678;
+    uchar8 p3  = middle.s23456789;
+    uchar8 p4  = bottom.s01234567;
+    uchar8 out = sort5(p0, p1, p2, p3, p4);
+#else /* MIN or MAX or MEDIAN */
+#error "Unsupported filter function"
+#endif /* MIN or MAX or MEDIAN */
+
+    // Store result
+    vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non linear filter on a 3x3 disk basis on an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk3x3(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    uchar16 top    = vload16(0, offset(&src, -1, -1));
+    uchar16 middle = vload16(0, offset(&src, -1, 0));
+    uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+    // Apply respective filter
+#ifdef MIN
+    uchar16 tmp = min(top, min(middle, bottom));
+    uchar8  out = row_reduce_min_3(tmp);
+#elif defined(MAX)
+    uchar16 tmp        = max(top, max(middle, bottom));
+    uchar8  out        = row_reduce_max_3(tmp);
+#elif defined(MEDIAN)
+    uchar8 p0  = top.s01234567;
+    uchar8 p1  = top.s12345678;
+    uchar8 p2  = top.s23456789;
+    uchar8 p3  = middle.s01234567;
+    uchar8 p4  = middle.s12345678;
+    uchar8 p5  = middle.s23456789;
+    uchar8 p6  = bottom.s01234567;
+    uchar8 p7  = bottom.s12345678;
+    uchar8 p8  = bottom.s23456789;
+    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else /* MIN or MAX or MEDIAN */
+#error "Unsupported filter function"
+#endif /* MIN or MAX or MEDIAN */
+
+    // Store result
+    vstore8(out, 0, dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter5x5.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter5x5.clembed
new file mode 100644
index 0000000..488b726
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter5x5.clembed

@@ -0,0 +1,1170 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sorts element-wise two vectors.
+ *
+ * @param[in, out] a First vector
+ * @param[in, out] b Second vector
+ */
+#define SORT(a, b)                  \
+    {                               \
+        uchar8 min_val = min(a, b); \
+        uchar8 max_val = max(a, b); \
+        a              = min_val;   \
+        b              = max_val;   \
+    }
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 5 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
+{
+    SORT(p0, p1);
+    SORT(p2, p3);
+    SORT(p0, p2);
+    SORT(p1, p3);
+    SORT(p1, p2);
+    SORT(p0, p4);
+    SORT(p1, p4);
+    SORT(p2, p4);
+
+    return p2;
+}
+
+/** Sorting network to sort 9 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ * @param[in] p5 Sixth element vector
+ * @param[in] p6 Seventh element vector
+ * @param[in] p7 Eigth element vector
+ * @param[in] p8 Ninth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
+{
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p7, p8);
+    SORT(p0, p1);
+    SORT(p3, p4);
+    SORT(p6, p7);
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p7, p8);
+    SORT(p0, p3);
+    SORT(p5, p8);
+    SORT(p4, p7);
+    SORT(p3, p6);
+    SORT(p1, p4);
+    SORT(p2, p5);
+    SORT(p4, p7);
+    SORT(p4, p2);
+    SORT(p6, p4);
+    SORT(p4, p2);
+
+    return p4;
+}
+
+/** Calculate the minimum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_min_3(uchar16 val)
+{
+    return min(val.s01234567, min(val.s12345678, val.s23456789));
+}
+
+/** Calculate the maximum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_max_3(uchar16 val)
+{
+    return max(val.s01234567, max(val.s12345678, val.s23456789));
+}
+
+/** Calculate the minimum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_min_5(uchar16 val)
+{
+    return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
+}
+
+/** Calculate the maximum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_max_5(uchar16 val)
+{
+    return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
+}
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 8 disks of diameter 5 and return their median.
+ *
+ * @param[in] top2    Values of elements two rows above.
+ * @param[in] top     Values of elements one row above.
+ * @param[in] middle  Values of middle elements.
+ * @param[in] bottom  Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+    uchar8 p0  = top2.s01234567;
+    uchar8 p1  = top2.s12345678;
+    uchar8 p2  = top2.s23456789;
+    uchar8 p3  = top.s01234567;
+    uchar8 p4  = top.s12345678;
+    uchar8 p5  = top.s23456789;
+    uchar8 p6  = top.s3456789A;
+    uchar8 p7  = top.s456789AB;
+    uchar8 p8  = middle.s01234567;
+    uchar8 p9  = middle.s12345678;
+    uchar8 p10 = middle.s23456789;
+    uchar8 p11 = middle.s3456789A;
+    uchar8 p12 = middle.s456789AB;
+    uchar8 p13 = bottom.s01234567;
+    uchar8 p14 = bottom.s12345678;
+    uchar8 p15 = bottom.s23456789;
+    uchar8 p16 = bottom.s3456789A;
+    uchar8 p17 = bottom.s456789AB;
+    uchar8 p18 = bottom2.s01234567;
+    uchar8 p19 = bottom2.s12345678;
+    uchar8 p20 = bottom2.s23456789;
+
+    SORT(p0, p1);
+    SORT(p2, p3);
+    SORT(p4, p5);
+    SORT(p6, p7);
+    SORT(p8, p9);
+    SORT(p10, p11);
+    SORT(p12, p13);
+    SORT(p14, p15);
+    SORT(p16, p17);
+    SORT(p18, p19);
+    SORT(p0, p2);
+    SORT(p1, p3);
+    SORT(p4, p6);
+    SORT(p5, p7);
+    SORT(p8, p10);
+    SORT(p9, p11);
+    SORT(p12, p14);
+    SORT(p13, p15);
+    SORT(p16, p18);
+    SORT(p17, p19);
+    SORT(p1, p2);
+    SORT(p5, p6);
+    SORT(p0, p4);
+    SORT(p3, p7);
+    SORT(p9, p10);
+    SORT(p13, p14);
+    SORT(p8, p12);
+    SORT(p11, p15);
+    SORT(p17, p18);
+    SORT(p16, p20);
+    SORT(p1, p5);
+    SORT(p2, p6);
+    SORT(p9, p13);
+    SORT(p10, p14);
+    SORT(p0, p8);
+    SORT(p7, p15);
+    SORT(p17, p20);
+    SORT(p1, p4);
+    SORT(p3, p6);
+    SORT(p9, p12);
+    SORT(p11, p14);
+    SORT(p18, p20);
+    SORT(p0, p16);
+    SORT(p2, p4);
+    SORT(p3, p5);
+    SORT(p10, p12);
+    SORT(p11, p13);
+    SORT(p1, p9);
+    SORT(p6, p14);
+    SORT(p19, p20);
+    SORT(p3, p4);
+    SORT(p11, p12);
+    SORT(p1, p8);
+    SORT(p2, p10);
+    SORT(p5, p13);
+    SORT(p7, p14);
+    SORT(p3, p11);
+    SORT(p2, p8);
+    SORT(p4, p12);
+    SORT(p7, p13);
+    SORT(p1, p17);
+    SORT(p3, p10);
+    SORT(p5, p12);
+    SORT(p1, p16);
+    SORT(p2, p18);
+    SORT(p3, p9);
+    SORT(p6, p12);
+    SORT(p2, p16);
+    SORT(p3, p8);
+    SORT(p7, p12);
+    SORT(p5, p9);
+    SORT(p6, p10);
+    SORT(p4, p8);
+    SORT(p7, p11);
+    SORT(p3, p19);
+    SORT(p5, p8);
+    SORT(p7, p10);
+    SORT(p3, p18);
+    SORT(p4, p20);
+    SORT(p6, p8);
+    SORT(p7, p9);
+    SORT(p3, p17);
+    SORT(p5, p20);
+    SORT(p7, p8);
+    SORT(p3, p16);
+    SORT(p6, p20);
+    SORT(p5, p17);
+    SORT(p7, p20);
+    SORT(p4, p16);
+    SORT(p6, p18);
+    SORT(p5, p16);
+    SORT(p7, p19);
+    SORT(p7, p18);
+    SORT(p6, p16);
+    SORT(p7, p17);
+    SORT(p10, p18);
+    SORT(p7, p16);
+    SORT(p9, p17);
+    SORT(p8, p16);
+    SORT(p9, p16);
+    SORT(p10, p16);
+
+    return p10;
+}
+
+/** Sorting network to sort 8 boxes of size 5 and return their median.
+ *
+ * @param[in] top2    Values of elements two rows above.
+ * @param[in] top     Values of elements one row above.
+ * @param[in] middle  Values of middle elements.
+ * @param[in] bottom  Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+    uchar8 p0  = top2.s01234567;
+    uchar8 p1  = top2.s12345678;
+    uchar8 p2  = top2.s23456789;
+    uchar8 p3  = top2.s3456789A;
+    uchar8 p4  = top2.s456789AB;
+    uchar8 p5  = top.s01234567;
+    uchar8 p6  = top.s12345678;
+    uchar8 p7  = top.s23456789;
+    uchar8 p8  = top.s3456789A;
+    uchar8 p9  = top.s456789AB;
+    uchar8 p10 = middle.s01234567;
+    uchar8 p11 = middle.s12345678;
+    uchar8 p12 = middle.s23456789;
+    uchar8 p13 = middle.s3456789A;
+    uchar8 p14 = middle.s456789AB;
+    uchar8 p15 = bottom.s01234567;
+    uchar8 p16 = bottom.s12345678;
+    uchar8 p17 = bottom.s23456789;
+    uchar8 p18 = bottom.s3456789A;
+    uchar8 p19 = bottom.s456789AB;
+    uchar8 p20 = bottom2.s01234567;
+    uchar8 p21 = bottom2.s12345678;
+    uchar8 p22 = bottom2.s23456789;
+    uchar8 p23 = bottom2.s3456789A;
+    uchar8 p24 = bottom2.s456789AB;
+
+    SORT(p1, p2);
+    SORT(p0, p1);
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p3, p4);
+    SORT(p4, p5);
+    SORT(p0, p3);
+    SORT(p2, p5);
+    SORT(p2, p3);
+    SORT(p1, p4);
+    SORT(p1, p2);
+    SORT(p3, p4);
+    SORT(p7, p8);
+    SORT(p6, p7);
+    SORT(p7, p8);
+    SORT(p10, p11);
+    SORT(p9, p10);
+    SORT(p10, p11);
+    SORT(p6, p9);
+    SORT(p8, p11);
+    SORT(p8, p9);
+    SORT(p7, p10);
+    SORT(p7, p8);
+    SORT(p9, p10);
+    SORT(p0, p6);
+    SORT(p4, p10);
+    SORT(p4, p6);
+    SORT(p2, p8);
+    SORT(p2, p4);
+    SORT(p6, p8);
+    SORT(p1, p7);
+    SORT(p5, p11);
+    SORT(p5, p7);
+    SORT(p3, p9);
+    SORT(p3, p5);
+    SORT(p7, p9);
+    SORT(p1, p2);
+    SORT(p3, p4);
+    SORT(p5, p6);
+    SORT(p7, p8);
+    SORT(p9, p10);
+    SORT(p13, p14);
+    SORT(p12, p13);
+    SORT(p13, p14);
+    SORT(p16, p17);
+    SORT(p15, p16);
+    SORT(p16, p17);
+    SORT(p12, p15);
+    SORT(p14, p17);
+    SORT(p14, p15);
+    SORT(p13, p16);
+    SORT(p13, p14);
+    SORT(p15, p16);
+    SORT(p19, p20);
+    SORT(p18, p19);
+    SORT(p19, p20);
+    SORT(p21, p22);
+    SORT(p23, p24);
+    SORT(p21, p23);
+    SORT(p22, p24);
+    SORT(p22, p23);
+    SORT(p18, p21);
+    SORT(p20, p23);
+    SORT(p20, p21);
+    SORT(p19, p22);
+    SORT(p22, p24);
+    SORT(p19, p20);
+    SORT(p21, p22);
+    SORT(p23, p24);
+    SORT(p12, p18);
+    SORT(p16, p22);
+    SORT(p16, p18);
+    SORT(p14, p20);
+    SORT(p20, p24);
+    SORT(p14, p16);
+    SORT(p18, p20);
+    SORT(p22, p24);
+    SORT(p13, p19);
+    SORT(p17, p23);
+    SORT(p17, p19);
+    SORT(p15, p21);
+    SORT(p15, p17);
+    SORT(p19, p21);
+    SORT(p13, p14);
+    SORT(p15, p16);
+    SORT(p17, p18);
+    SORT(p19, p20);
+    SORT(p21, p22);
+    SORT(p23, p24);
+    SORT(p0, p12);
+    SORT(p8, p20);
+    SORT(p8, p12);
+    SORT(p4, p16);
+    SORT(p16, p24);
+    SORT(p12, p16);
+    SORT(p2, p14);
+    SORT(p10, p22);
+    SORT(p10, p14);
+    SORT(p6, p18);
+    SORT(p6, p10);
+    SORT(p10, p12);
+    SORT(p1, p13);
+    SORT(p9, p21);
+    SORT(p9, p13);
+    SORT(p5, p17);
+    SORT(p13, p17);
+    SORT(p3, p15);
+    SORT(p11, p23);
+    SORT(p11, p15);
+    SORT(p7, p19);
+    SORT(p7, p11);
+    SORT(p11, p13);
+    SORT(p11, p12);
+    return p12;
+}
+
+/** This function applies a non linear filter on a 5x5 box basis on an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box5x5(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    uchar16 top2    = vload16(0, offset(&src, -2, -2));
+    uchar16 top     = vload16(0, offset(&src, -2, -1));
+    uchar16 middle  = vload16(0, offset(&src, -2, 0));
+    uchar16 bottom  = vload16(0, offset(&src, -2, 1));
+    uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
+
+    // Apply respective filter
+#ifdef MIN
+    uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
+    uchar8  out = row_reduce_min_5(tmp);
+#elif defined(MAX)
+    uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
+    uchar8  out = row_reduce_max_5(tmp);
+#elif defined(MEDIAN)
+    uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
+#else /* MIN or MAX or MEDIAN */
+#error "Unsupported filter function"
+#endif /* MIN or MAX or MEDIAN */
+
+    // Store result
+    vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non linear filter on a 5x5 cross basis on an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross5x5(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    uchar8  top2    = vload8(0, offset(&src, 0, -2));
+    uchar8  top     = vload8(0, offset(&src, 0, -1));
+    uchar16 middle  = vload16(0, offset(&src, -2, 0));
+    uchar8  bottom  = vload8(0, offset(&src, 0, 1));
+    uchar8  bottom2 = vload8(0, offset(&src, 0, 2));
+
+    // Apply respective filter
+#ifdef MIN
+    uchar8 tmp_middle = row_reduce_min_5(middle);
+    uchar8 out        = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
+#elif defined(MAX)
+    uchar8  tmp_middle = row_reduce_max_5(middle);
+    uchar8  out        = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2)));
+#elif defined(MEDIAN)
+    uchar8 p0  = top2;
+    uchar8 p1  = top;
+    uchar8 p2  = middle.s01234567;
+    uchar8 p3  = middle.s12345678;
+    uchar8 p4  = middle.s23456789;
+    uchar8 p5  = middle.s3456789A;
+    uchar8 p6  = middle.s456789AB;
+    uchar8 p7  = bottom;
+    uchar8 p8  = bottom2;
+    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else /* MIN or MAX or MEDIAN */
+#error "Unsupported filter function"
+#endif /* MIN or MAX or MEDIAN */
+
+    // Store result
+    vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non linear filter on a 5x5 disk basis on an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk5x5(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load values
+    uchar16 top2    = vload16(0, offset(&src, -2, -2));
+    uchar16 top     = vload16(0, offset(&src, -2, -1));
+    uchar16 middle  = vload16(0, offset(&src, -2, 0));
+    uchar16 bottom  = vload16(0, offset(&src, -2, 1));
+    uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
+
+    // Shift top2 and bottom2 values
+    top2    = top2.s123456789ABCDEFF;
+    bottom2 = bottom2.s123456789ABCDEFF;
+
+    // Apply respective filter
+#ifdef MIN
+    uchar16 tmp_3     = min(top2, bottom2);
+    uchar16 tmp_5     = min(middle, min(top, bottom));
+    uchar8  tmp_3_red = row_reduce_min_3(tmp_3);
+    uchar8  tmp_5_red = row_reduce_min_5(tmp_5);
+    uchar8  out       = min(tmp_3_red, tmp_5_red);
+#elif defined(MAX)
+    uchar16 tmp_3      = max(top2, bottom2);
+    uchar16 tmp_5      = max(middle, max(top, bottom));
+    uchar8  tmp_3_red  = row_reduce_max_3(tmp_3);
+    uchar8  tmp_5_red  = row_reduce_max_5(tmp_5);
+    uchar8  out        = max(tmp_3_red, tmp_5_red);
+#elif defined(MEDIAN)
+    uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
+#else /* MIN or MAX or MEDIAN */
+#error "Unsupported filter function"
+#endif /* MIN or MAX or MEDIAN */
+
+    // Store result
+    vstore8(out, 0, dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter_helpers.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter_helpers.hembed
new file mode 100644
index 0000000..48633ed
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/non_linear_filter_helpers.hembed

@@ -0,0 +1,149 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sorts element-wise two vectors.
+ *
+ * @param[in, out] a First vector
+ * @param[in, out] b Second vector
+ */
+#define SORT(a, b)                  \
+    {                               \
+        uchar8 min_val = min(a, b); \
+        uchar8 max_val = max(a, b); \
+        a              = min_val;   \
+        b              = max_val;   \
+    }
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 5 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
+{
+    SORT(p0, p1);
+    SORT(p2, p3);
+    SORT(p0, p2);
+    SORT(p1, p3);
+    SORT(p1, p2);
+    SORT(p0, p4);
+    SORT(p1, p4);
+    SORT(p2, p4);
+
+    return p2;
+}
+
+/** Sorting network to sort 9 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ * @param[in] p5 Sixth element vector
+ * @param[in] p6 Seventh element vector
+ * @param[in] p7 Eigth element vector
+ * @param[in] p8 Ninth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
+{
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p7, p8);
+    SORT(p0, p1);
+    SORT(p3, p4);
+    SORT(p6, p7);
+    SORT(p1, p2);
+    SORT(p4, p5);
+    SORT(p7, p8);
+    SORT(p0, p3);
+    SORT(p5, p8);
+    SORT(p4, p7);
+    SORT(p3, p6);
+    SORT(p1, p4);
+    SORT(p2, p5);
+    SORT(p4, p7);
+    SORT(p4, p2);
+    SORT(p6, p4);
+    SORT(p4, p2);
+
+    return p4;
+}
+
+/** Calculate the minimum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_min_3(uchar16 val)
+{
+    return min(val.s01234567, min(val.s12345678, val.s23456789));
+}
+
+/** Calculate the maximum of a sliding window of size 3.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_max_3(uchar16 val)
+{
+    return max(val.s01234567, max(val.s12345678, val.s23456789));
+}
+
+/** Calculate the minimum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the minimum values
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_min_5(uchar16 val)
+{
+    return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
+}
+
+/** Calculate the maximum of a sliding window of size 5.
+ *
+ * @param val Values to calculate the maximum values
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_max_5(uchar16 val)
+{
+    return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/nonmax.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/nonmax.clembed
new file mode 100644
index 0000000..ca72061
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/nonmax.clembed

@@ -0,0 +1,613 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function performs Non maxima suppression over a 3x3 window on a given image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_max_suppression(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    vc = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    if(all(vc == (DATA_TYPE)0))
+    {
+        vstore8(0, 0, (__global DATA_TYPE *)dst.ptr);
+
+        return;
+    }
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
+
+    nc  = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
+    out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
+
+    nc  = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
+    out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
+
+    vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/normalization_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/normalization_layer.clembed
new file mode 100644
index 0000000..27d25ae
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/normalization_layer.clembed

@@ -0,0 +1,797 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#define LOAD_OP(offset, ptr) vload4(offset, ptr)
+#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
+
+#if defined(NUM_SLICES)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
+                                            TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_slice = get_global_id(2);
+    const int left_slice    = max(-(int)RADIUS, -current_slice);
+    const int right_slice   = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
+
+    for(int i = left_slice; i <= right_slice; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
+        acc    = ADD_OP(acc, MUL_OP(values, values));
+    }
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
+
+#if defined(WIDTH_SIZE)
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_col = get_global_id(0) << 2;
+    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
+    const int right_pos   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
+
+#if defined(IN_MAP_2D)
+    const int current_row = get_global_id(1);
+    const int first_row   = max(-(int)RADIUS, -current_row);
+    const int last_row    = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
+    {
+#endif /* defined(IN_MAP_2D) */
+        for(int i = left_pos; i <= right_pos; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
+    }
+#endif /* defined(IN_MAP_2D) */
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(WIDTH_SIZE)
+
+#if defined(NUM_SLICES)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_cols = get_global_id(1);
+    const int first_col    = max(-(int)RADIUS, -current_cols);
+    const int last_col     = min((int)RADIUS, (int)get_global_size(1) - 1 - current_cols);
+
+#if defined(IN_MAP_2D)
+    const int current_rows = get_global_id(2);
+    const int first_row    = max(-(int)RADIUS, -current_rows);
+    const int last_row     = min((int)RADIUS, (int)NUM_SLICES - 1 - current_rows);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
+    {
+#endif /* defined(IN_MAP_2D) */
+        for(int i = first_col; i <= last_col; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, j));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
+    }
+#endif /* defined(IN_MAP_2D) */
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.clembed
new file mode 100644
index 0000000..4c71f2a
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.clembed

@@ -0,0 +1,677 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+                                              TENSOR3D_DECLARATION(dst),
+                                              VECTOR_DECLARATION(mean),
+                                              VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+    const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
+    const DATA_TYPE curr_std  = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
+
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    TYPE res  = (data - curr_mean) / curr_std;
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
+                                              TENSOR3D_DECLARATION(dst),
+                                              VECTOR_DECLARATION(mean),
+                                              VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(0);
+
+    const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
+    const TYPE curr_std  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
+
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    TYPE res  = (data - curr_mean) / curr_std;
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.clembed
new file mode 100644
index 0000000..6dfcc52
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.clembed

@@ -0,0 +1,701 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+                                                 TENSOR3D_DECLARATION(dst),
+                                                 VECTOR_DECLARATION(mean),
+                                                 VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+    float16 curr_mean_flt = (float16)(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+    curr_mean_flt         = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 curr_std_flt = (float16)(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+    curr_std_flt         = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
+    data_flt         = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+    // Perform normalization
+    float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+    const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+    VSTORE(VEC_SIZE)
+    (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#endif // defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
+                                                 TENSOR3D_DECLARATION(dst),
+                                                 VECTOR_DECLARATION(mean),
+                                                 VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(0);
+
+    float16 curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
+    curr_mean_flt         = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
+    curr_std_flt         = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
+    data_flt         = round(data_flt - OFFSET_FLT) * (SCALE_FLT);
+
+    // Perform normalization
+    float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+    const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+    VSTORE(VEC_SIZE)
+    (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.clembed
new file mode 100644
index 0000000..73244d1
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.clembed

@@ -0,0 +1,1119 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+    int x; /**< The x coordinate. */
+    int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/* Keypoint struct */
+typedef struct Keypoint
+{
+    int   x;               /**< The x coordinate. */
+    int   y;               /**< The y coordinate. */
+    float strength;        /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+    float scale;           /**< Initialized to 0 by corner detectors. */
+    float orientation;     /**< Initialized to 0 by corner detectors. */
+    int   tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+    ushort x;         /**< Top-left x coordinate */
+    ushort y;         /**< Top-left y coordinate */
+    ushort width;     /**< Width of the detection window */
+    ushort height;    /**< Height of the detection window */
+    ushort idx_class; /**< Index of the class */
+    float  score;     /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
+
+/*
+ *The criteria for lost tracking is that the spatial gradient matrix has:
+ * - Determinant less than DETERMINANT_THR
+ * - or minimum eigenvalue is smaller then EIGENVALUE_THR
+ *
+ * The thresholds for the determinant and the minimum eigenvalue is
+ * defined by the OpenVX spec
+ *
+ * Note: Also lost tracking happens when the point tracked coordinate is outside
+ * the image coordinates
+ *
+ * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
+ */
+
+/* Internal Lucas-Kanade Keypoint struct */
+typedef struct InternalKeypoint
+{
+    float x;               /**< The x coordinate. */
+    float y;               /**< The y coordinate. */
+    float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float dummy;           /**< Dummy member for alignment. */
+} InternalKeypoint;
+
+/** Threshold for the determinant. Used for lost tracking criteria */
+#define DETERMINANT_THR 1.0e-07f
+
+/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */
+#define EIGENVALUE_THR 1.0e-04f
+
+/** Constants used for Lucas-Kanade Algorithm */
+#define W_BITS (14)
+#define FLT_SCALE (1.0f / (float)(1 << 20))
+#define D0 ((float)(1 << W_BITS))
+#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
+
+/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
+ *
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in]     scale               Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level(
+    __global float4 *old_points_internal,
+    __global float4 *new_points_internal,
+    const float      scale)
+{
+    int idx = get_global_id(0);
+
+    // Get old and new keypoints
+    float4 old_point = old_points_internal[idx];
+    float4 new_point = new_points_internal[idx];
+
+    // Scale accordingly with the pyramid_scale
+    old_point.xy *= (float2)(2.0f);
+    new_point.xy *= (float2)(2.0f);
+
+    old_points_internal[idx] = old_point;
+    new_points_internal[idx] = new_point;
+}
+
+/** Initializes the internal new points array when the level of pyramid is equal to max.
+ *
+ * @param[in]     old_points          An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out]    new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in]     scale               Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max(
+    __global Keypoint *old_points,
+    __global InternalKeypoint *old_points_internal,
+    __global InternalKeypoint *new_points_internal,
+    const float                scale)
+{
+    int idx = get_global_id(0);
+
+    Keypoint old_point = old_points[idx];
+
+    // Get old keypoint to track
+    InternalKeypoint old_point_internal;
+    old_point_internal.x               = old_point.x * scale;
+    old_point_internal.y               = old_point.y * scale;
+    old_point_internal.tracking_status = 1.f;
+
+    // Store internal keypoints
+    old_points_internal[idx] = old_point_internal;
+    new_points_internal[idx] = old_point_internal;
+}
+
+/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1.
+ *
+ * @param[in]     old_points           An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in]     new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal  An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out]    new_points_internal  An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in]     scale                Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max_initial_estimate(
+    __global Keypoint *old_points,
+    __global Keypoint *new_points_estimates,
+    __global InternalKeypoint *old_points_internal,
+    __global InternalKeypoint *new_points_internal,
+    const float                scale)
+{
+    int idx = get_global_id(0);
+
+    Keypoint         old_point          = old_points[idx];
+    Keypoint         new_point_estimate = new_points_estimates[idx];
+    InternalKeypoint old_point_internal;
+    InternalKeypoint new_point_internal;
+
+    // Get old keypoint to track
+    old_point_internal.x               = old_point.x * scale;
+    old_point_internal.y               = old_point.y * scale;
+    old_point_internal.tracking_status = 1.f;
+
+    // Get new keypoint to track
+    new_point_internal.x               = new_point_estimate.x * scale;
+    new_point_internal.y               = new_point_estimate.y * scale;
+    new_point_internal.tracking_status = new_point_estimate.tracking_status;
+
+    // Store internal keypoints
+    old_points_internal[idx] = old_point_internal;
+    new_points_internal[idx] = new_point_internal;
+}
+
+/** Truncates the coordinates stored in new_points array
+ *
+ * @param[in]  new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid.
+ * @param[out] new_points          An array of internal key points that are defined at the new_images high resolution pyramid.
+ */
+__kernel void finalize(
+    __global InternalKeypoint *new_points_internal,
+    __global Keypoint *new_points)
+{
+    int idx = get_global_id(0);
+
+    // Load internal keypoint
+    InternalKeypoint new_point_internal = new_points_internal[idx];
+
+    // Calculate output point
+    Keypoint new_point;
+    new_point.x               = round(new_point_internal.x);
+    new_point.y               = round(new_point_internal.y);
+    new_point.strength        = 0.f;
+    new_point.scale           = 0.f;
+    new_point.orientation     = 0.f;
+    new_point.tracking_status = new_point_internal.tracking_status;
+    new_point.error           = 0.f;
+
+    // Store new point
+    new_points[idx] = new_point;
+}
+
+/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1.
+ *
+ * @param[in]      old_image_ptr                               Pointer to the input old image. Supported data types: U8
+ * @param[in]      old_image_stride_x                          Stride of the input old image in X dimension (in bytes)
+ * @param[in]      old_image_step_x                            old_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      old_image_stride_y                          Stride of the input old image in Y dimension (in bytes)
+ * @param[in]      old_image_step_y                            old_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      old_image_offset_first_element_in_bytes     The offset of the first element in the input old image
+ * @param[in]      old_scharr_gx_ptr                           Pointer to the input scharr x image. Supported data types: S16
+ * @param[in]      old_scharr_gx_stride_x                      Stride of the input scharr x image in X dimension (in bytes)
+ * @param[in]      old_scharr_gx_step_x                        old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      old_scharr_gx_stride_y                      Stride of the input scharr x image in Y dimension (in bytes)
+ * @param[in]      old_scharr_gx_step_y                        old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
+ * @param[in]      old_scharr_gy_ptr                           Pointer to the input scharr y image. Supported data types: S16
+ * @param[in]      old_scharr_gy_stride_x                      Stride of the input scharr y image in X dimension (in bytes)
+ * @param[in]      old_scharr_gy_step_x                        old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      old_scharr_gy_stride_y                      Stride of the input scharr y image in Y dimension (in bytes)
+ * @param[in]      old_scharr_gy_step_y                        old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
+ * @param[in]      old_points                                  An array of key points. Those key points are defined at the old_images high resolution pyramid
+ * @param[in, out] new_points                                  An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[out]     coeff                                       It stores | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[out]     iold_val                                    It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in]      window_dimension                            The size of the window on which to perform the algorithm
+ * @param[in]      window_dimension_pow2                       The squared size of the window on which to perform the algorithm
+ * @param[in]      half_window                                 The half size of the window on which to perform the algorithm
+ * @param[in]      border_limits                               It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
+ * @param[in]      eig_const                                   1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in]      level0                                      It is set to 1 if level 0 of the pyramid
+ */
+void __kernel lktracker_stage0(
+    IMAGE_DECLARATION(old_image),
+    IMAGE_DECLARATION(old_scharr_gx),
+    IMAGE_DECLARATION(old_scharr_gy),
+    __global float4 *old_points,
+    __global float4 *new_points,
+    __global float4 *coeff,
+    __global short4 *iold_val,
+    const int        window_dimension,
+    const int        window_dimension_pow2,
+    const int        half_window,
+    const float3     border_limits,
+    const float      eig_const,
+    const int        level0)
+{
+    int idx = get_global_id(0);
+
+    Image old_image     = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
+    Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
+    Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
+
+    // Get old keypoint
+    float2 old_keypoint = old_points[idx].xy - (float2)half_window;
+
+    // Get the floor value
+    float2 iold_keypoint = floor(old_keypoint);
+
+    // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+    if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
+    {
+        if(level0 == 1)
+        {
+            // Invalidate tracked point as we are at level 0
+            new_points[idx].s2 = 0.0f;
+        }
+
+        // Not valid coordinate. It sets min_eig to 0.0f
+        coeff[idx].s3 = 0.0f;
+
+        return;
+    }
+
+    // Compute weight for the bilinear interpolation
+    float2 ab = old_keypoint - iold_keypoint;
+
+    // Weight used for Bilinear-Interpolation on Scharr images
+    // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
+    // w_scharr.s1 = ab.x * (1.0f - ab.y)
+    // w_scharr.s2 = (1.0f - ab.x) * ab.y
+    // w_scharr.s3 = ab.x * ab.y
+
+    float4 w_scharr;
+    w_scharr.s3  = ab.x * ab.y;
+    w_scharr.s0  = w_scharr.s3 + 1.0f - ab.x - ab.y;
+    w_scharr.s12 = ab - (float2)w_scharr.s3;
+
+    // Weight used for Bilinear-Interpolation on Old and New images
+    // w.s0 = round(w_scharr.s0 * D0)
+    // w.s1 = round(w_scharr.s1 * D0)
+    // w.s2 = round(w_scharr.s2 * D0)
+    // w.s3 = w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+    float4 w;
+    w    = round(w_scharr * (float4)D0);
+    w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
+
+    // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+    int4 iG = (int4)0;
+
+    // Window offset
+    int window_offset = idx * window_dimension_pow2;
+
+    // Compute Spatial Gradient Matrix G
+    for(ushort ky = 0; ky < window_dimension; ++ky)
+    {
+        int offset_y = iold_keypoint.y + ky;
+        for(ushort kx = 0; kx < window_dimension; ++kx)
+        {
+            int    offset_x = iold_keypoint.x + kx;
+            float4 px;
+
+            // Load values from old_image for computing the bilinear interpolation
+            px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
+                                         vload2(0, offset(&old_image, offset_x, offset_y + 1))));
+
+            // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
+            float4 old_i;
+
+            // Compute bilinear interpolation (with D1 scale factor) for ival
+            old_i.s0 = dot(px, w) * D1;
+
+            // Load values from old_scharr_gx for computing the bilinear interpolation
+            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
+                                         vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
+
+            // Compute bilinear interpolation for ixval
+            old_i.s1 = dot(px, w_scharr);
+
+            // Load values from old_scharr_gy for computing the bilinear interpolation
+            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
+                                         vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
+
+            // Compute bilinear interpolation for iyval
+            old_i.s2 = dot(px, w_scharr);
+
+            // Rounding (it could be omitted. Used just for matching the VX implementation)
+            int4 iold = convert_int4(round(old_i));
+
+            // Accumulate values in the Spatial Gradient Matrix
+            iG.s0 += (int)(iold.s1 * iold.s1);
+            iG.s1 += (int)(iold.s1 * iold.s2);
+            iG.s2 += (int)(iold.s2 * iold.s2);
+
+            // Store ival, ixval and iyval
+            iold_val[window_offset + kx] = convert_short4(iold);
+        }
+        window_offset += window_dimension;
+    }
+
+    // Scale iA11, iA12 and iA22
+    float4 G = convert_float4(iG) * (float4)FLT_SCALE;
+
+    // Compute minimum eigen value
+    G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
+
+    // Store A11. A11, A22 and min_eig
+    coeff[idx] = G;
+}
+
+/** Computes the motion vector for a given keypoint
+ *
+ * @param[in]      new_image_ptr                           Pointer to the input new image. Supported data types: U8
+ * @param[in]      new_image_stride_x                      Stride of the input new image in X dimension (in bytes)
+ * @param[in]      new_image_step_x                        new_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      new_image_stride_y                      Stride of the input new image in Y dimension (in bytes)
+ * @param[in]      new_image_step_y                        new_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      new_image_offset_first_element_in_bytes The offset of the first element in the input new image
+ * @param[in, out] new_points                              An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[in]      coeff                                   The | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[in]      iold_val                                The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in]      window_dimension                        The size of the window on which to perform the algorithm
+ * @param[in]      window_dimension_pow2                   The squared size of the window on which to perform the algorithm
+ * @param[in]      half_window                             The half size of the window on which to perform the algorithm
+ * @param[in]      num_iterations                          The maximum number of iterations
+ * @param[in]      epsilon                                 The value for terminating the algorithm.
+ * @param[in]      border_limits                           It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
+ * @param[in]      eig_const                               1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in]      level0                                  It is set to 1 if level of pyramid = 0
+ * @param[in]      term_epsilon                            It is set to 1 if termination = TERM_CRITERIA_EPSILON
+ */
+void __kernel lktracker_stage1(
+    IMAGE_DECLARATION(new_image),
+    __global float4 *new_points,
+    __global float4 *coeff,
+    __global short4 *iold_val,
+    const int        window_dimension,
+    const int        window_dimension_pow2,
+    const int        half_window,
+    const int        num_iterations,
+    const float      epsilon,
+    const float3     border_limits,
+    const float      eig_const,
+    const int        level0,
+    const int        term_epsilon)
+{
+    int   idx       = get_global_id(0);
+    Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
+
+    // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+    float4 G = coeff[idx];
+
+    // Determinant
+    float D = G.s0 * G.s2 - G.s1 * G.s1;
+
+    // Check if it is a good point to track
+    if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
+    {
+        if(level0 == 1)
+        {
+            // Invalidate tracked point as we are at level 0
+            new_points[idx].s2 = 0;
+        }
+
+        return;
+    }
+
+    // Compute inverse
+    //D = native_recip(D);
+    D = 1.0 / D;
+
+    // Get new keypoint
+    float2 new_keypoint = new_points[idx].xy - (float)half_window;
+
+    // Get new point
+    float2 out_new_point = new_points[idx].xy;
+
+    // Keep delta obtained in the previous iteration
+    float2 prev_delta = (float2)0.0f;
+
+    int j = 0;
+    while(j < num_iterations)
+    {
+        // Get the floor value
+        float2 inew_keypoint = floor(new_keypoint);
+
+        // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+        if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
+        {
+            if(level0 == 1)
+            {
+                // Invalidate tracked point as we are at level 0
+                new_points[idx].s2 = 0.0f;
+            }
+            else
+            {
+                new_points[idx].xy = out_new_point;
+            }
+
+            return;
+        }
+
+        // Compute weight for the bilinear interpolation
+        float2 ab = new_keypoint - inew_keypoint;
+
+        // Weight used for Bilinear-Interpolation on Old and New images
+        // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
+        // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
+        // w.s2 = round((1.0f - ab.x) * ab.y * D0)
+        // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+        float4 w;
+        w.s3  = ab.x * ab.y;
+        w.s0  = w.s3 + 1.0f - ab.x - ab.y;
+        w.s12 = ab - (float2)w.s3;
+        w     = round(w * (float4)D0);
+        w.s3  = D0 - w.s0 - w.s1 - w.s2;
+
+        // Mismatch vector
+        int2 ib = 0;
+
+        // Old val offset
+        int old_val_offset = idx * window_dimension_pow2;
+
+        for(int ky = 0; ky < window_dimension; ++ky)
+        {
+            for(int kx = 0; kx < window_dimension; ++kx)
+            {
+                // ival, ixval and iyval have been computed in the previous stage
+                int4 old_ival = convert_int4(iold_val[old_val_offset]);
+
+                // Load values from old_image for computing the bilinear interpolation
+                float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
+                                                    vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
+
+                // Compute bilinear interpolation on new image
+                int jval = (int)round(dot(px, w) * D1);
+
+                // Compute luminance difference
+                int diff = (int)(jval - old_ival.s0);
+
+                // Accumulate values in mismatch vector
+                ib += (diff * old_ival.s12);
+
+                // Update old val offset
+                old_val_offset++;
+            }
+        }
+
+        float2 b = convert_float2(ib) * (float2)FLT_SCALE;
+
+        // Optical Flow
+        float2 delta;
+
+        delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
+        delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
+
+        // Update new point coordinate
+        new_keypoint += delta;
+
+        out_new_point = new_keypoint + (float2)half_window;
+
+        if(term_epsilon == 1)
+        {
+            float mag2 = dot(delta, delta);
+
+            if(mag2 <= epsilon)
+            {
+                new_points[idx].xy = out_new_point;
+
+                return;
+            }
+        }
+
+        // Check convergence analyzing the previous delta
+        if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
+        {
+            out_new_point -= delta * (float2)0.5f;
+
+            new_points[idx].xy = out_new_point;
+
+            return;
+        }
+
+        // Update previous delta
+        prev_delta = delta;
+
+        j++;
+    }
+
+    new_points[idx].xy = out_new_point;
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/pad_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/pad_layer.clembed
new file mode 100644
index 0000000..87673ec
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/pad_layer.clembed

@@ -0,0 +1,796 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE)
+#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+
+#if defined(CONST_VAL)
+/** Perform a pad operation when PaddingMode is CONSTANT
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile flag, e.g. -DSELECT_DT=float
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
+ *       -DNUM_THREADS_TO_SKIP_X compile flag, e.g. -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ *       -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ *       -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
+ *       -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  batch                             (Optional) Batch index if 4D pad must be applied
+ */
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
+                                 TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+                                 ,
+                                 uint batch
+#endif // defined(PAD_W_BEFORE)
+                                )
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    uint cond = 0;
+
+#if defined(PAD_W_BEFORE)
+    cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+#if defined(PAD_Z_BEFORE)
+    cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+
+    if(cond)
+    {
+        Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+        VSTORE(VEC_SIZE)
+        ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
+    }
+    else
+    {
+        Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+        Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(NUM_THREADS_TO_SKIP_X)
+        /* In case the pad left is greater than the vector size, and we are past the threads operating solely on pad values,
+         * the input pointer must be brought back along the X axis to start from the first non-pad values.
+         *
+         * E.g. with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
+         *  -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) < PAD_X_BEFORE
+         *  -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, 3)) < PAD_X_BEFORE
+         *  -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x * VEC_SIZE) == 4) values, reading |4 5|
+         *  -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and check that it is >= to the current x
+         *  -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying this constant by the input's step along the X axis
+         *  -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it will read the desired values |0 1|
+         */
+        src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X);
+#endif // defined(NUM_THREADS_TO_SKIP_X)
+#if defined(PAD_Z_BEFORE)
+        src.ptr -= PAD_Z_BEFORE * src_step_z;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+        src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z;
+#endif // defined(PAD_W_BEFORE)
+
+        VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+        VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT);
+        VEC_INT cond   = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+#if defined(PAD_Y_BEFORE)
+        cond |= (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
+        VSTORE(VEC_SIZE)
+        (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0, (__global DATA_TYPE *)dst.ptr);
+    }
+}
+#endif // defined(CONST_VAL)
+
+#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+
+#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile flag, e.g. -DSELECT_DT=float
+ * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER=5
+ * @note When after pad X, starting point to read backward from must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ *       -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ *       -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ *       -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src),
+                                          TENSOR3D_DECLARATION(dst))
+{
+    // Get current thread position
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Define conditions based on the thread X position w.r.t. pad left and right
+    const int x_out_first         = x * VEC_SIZE;
+    const int x_out_last          = x_out_first + VEC_SIZE;
+    const int is_before_pad_left  = (x_out_last <= PAD_X_BEFORE);
+    const int is_across_pad_left  = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+    const int is_inside_input     = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+    const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+    const int is_after_pad_right  = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+    // Calculate base pointers
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+    Tensor3D        dst      = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Calculate input tensor's offset based on the defined conditions
+    int x_offset = 0;
+    x_offset     = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+    x_offset     = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+    x_offset     = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+    x_offset     = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+    int neg_offs = x_offset < 0;
+    x_offset     = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+    // Load input values from the computed offset
+    int y_in = y;
+    int z_in = z;
+#if defined(PAD_Y_BEFORE)
+    y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+    y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+    z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+    z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+
+    src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+    VSTORE(VEC_SIZE)
+    ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+    VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+    // Choose rearrangement policy based on the defined conditions
+    src_vals = select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
+    src_vals = select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
+    src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+    src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+    // Store
+    VSTORE(VEC_SIZE)
+    (src_vals, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // SRC_WIDTH == 1
+}
+#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/permute.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/permute.clembed
new file mode 100644
index 0000000..e8af181
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/permute.clembed

@@ -0,0 +1,617 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/**Perform a permute operation on an input tensor of Shape DCHW.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute(TENSOR4D_DECLARATION(input),
+                      TENSOR4D_DECLARATION(output))
+
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    int out_index[4] = { 0 };
+    int in_index[4]  = { 0 };
+
+    in_index[0] = get_global_id(0);            // W
+    in_index[1] = get_global_id(1);            // H
+    in_index[2] = get_global_id(2) % DEPTH_IN; // C
+    in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+    out_index[0] = in_index[P1];
+    out_index[1] = in_index[P2];
+    out_index[2] = in_index[P3];
+    out_index[3] = in_index[P4];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/pixelwise_mul_float.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/pixelwise_mul_float.clembed
new file mode 100644
index 0000000..c25c852
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/pixelwise_mul_float.clembed

@@ -0,0 +1,688 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_float(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out),
+    const float scale)
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+    // Perform multiplication
+#ifdef DATA_TYPE_FLOAT
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    res = CONVERT(in1_data * in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else  /* DATA_TYPE_FLOAT */
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+#endif /* DATA_TYPE_FLOAT */
+
+    // Store result
+    vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+
+/** Performs a pixelwise multiplication of complex float values
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pixelwise_mul_complex(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    float2 vin1 = vload2(0, (__global float *)in1.ptr);
+    float2 vin2 = vload2(0, (__global float *)in2.ptr);
+
+    // Perform complex multiplication
+    float2 res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y };
+
+    // Store result
+    vstore2(res, 0, (__global float *)out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/pixelwise_mul_int.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/pixelwise_mul_int.clembed
new file mode 100644
index 0000000..9c07f5f
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/pixelwise_mul_int.clembed

@@ -0,0 +1,717 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data_type of the intermediate result of the multiplication should passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8/S16
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Integer scaling factor. Supported data types: S32.
+ */
+__kernel void pixelwise_mul_int(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out),
+    const uint scale)
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+    // Perform multiplication and store result
+    vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+
+#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+
+/** Performs a pixelwise multiplication with float scale of quantized inputs.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time only if asymmetric using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time only if asymmetric using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time only if asymmetric using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_quantized(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out),
+    const float scale)
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT);
+
+    // Dequantize
+#if defined(OFFSET_IN1)
+    in_a -= (VEC_INT)((int)OFFSET_IN1);
+#endif // defined(OFFSET_IN1)
+#if defined(OFFSET_IN2)
+    in_b -= (VEC_INT)((int)OFFSET_IN2);
+#endif // defined(OFFSET_IN2)
+    const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+#if defined(OFFSET_OUT)
+    const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+#else  // defined(OFFSET_OUT)
+    const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT);
+#endif // defined(OFFSET_OUT)
+    const VEC_TYPE res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/pooling_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/pooling_layer.clembed
new file mode 100644
index 0000000..07fcd12
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/pooling_layer.clembed

@@ -0,0 +1,1141 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(1.f / y))
+
+#if STRIDE_X == 1
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
+#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
+#endif /* STRIDE_X == 3 */
+
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+    CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+#define POOLING3x3_STRIDE1(res, input, output)                                                                                                       \
+    ({                                                                                                                                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4);                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4);                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4);                               \
+        data00 = POW2_OP(data00, 4);                                                                                                                 \
+        data01 = POW2_OP(data01, 2);                                                                                                                 \
+        data10 = POW2_OP(data10, 4);                                                                                                                 \
+        data11 = POW2_OP(data11, 2);                                                                                                                 \
+        data20 = POW2_OP(data20, 4);                                                                                                                 \
+        data21 = POW2_OP(data21, 2);                                                                                                                 \
+        \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01);                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01);                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01);                                                              \
+        \
+        values00 = POOL_OP(values00, values10);                                                                                                      \
+        values01 = POOL_OP(values01, values11);                                                                                                      \
+        values00 = POOL_OP(values00, values20);                                                                                                      \
+        values01 = POOL_OP(values01, values21);                                                                                                      \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
+    })
+
+#define POOLING3x3_STRIDE2(res, input, output)                                                                                                       \
+    ({                                                                                                                                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data00               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                     \
+        ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8));                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data10               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                     \
+        ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8));                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data20               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                     \
+        ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8));                                       \
+        data00               = POW2_OP(data00, 8);                                                                                                   \
+        data01               = POW2_OP(data01, 1);                                                                                                   \
+        data10               = POW2_OP(data10, 8);                                                                                                   \
+        data11               = POW2_OP(data11, 1);                                                                                                   \
+        data20               = POW2_OP(data20, 8);                                                                                                   \
+        data21               = POW2_OP(data21, 1);                                                                                                   \
+        \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01);                                                                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11);                                                                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21);                                                                           \
+        \
+        values00 = POOL_OP(values00, values10);                                                                                                      \
+        values01 = POOL_OP(values01, values11);                                                                                                      \
+        values00 = POOL_OP(values00, values20);                                                                                                      \
+        values01 = POOL_OP(values01, values21);                                                                                                      \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
+    })
+
+#define POOLING3x3_STRIDE3(res, input, output)                                                                                               \
+    ({                                                                                                                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8);                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8);                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                       \
+        data00 = POW2_OP(data00, 8);                                                                                                         \
+        data01 = POW2_OP(data01, 4);                                                                                                         \
+        data10 = POW2_OP(data10, 8);                                                                                                         \
+        data11 = POW2_OP(data11, 4);                                                                                                         \
+        data20 = POW2_OP(data20, 8);                                                                                                         \
+        data21 = POW2_OP(data21, 4);                                                                                                         \
+        \
+        data00 = POOL_OP(data00, data10);                                                                                                    \
+        data01 = POOL_OP(data01, data11);                                                                                                    \
+        data00 = POOL_OP(data00, data20);                                                                                                    \
+        data01 = POOL_OP(data01, data21);                                                                                                    \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03));                                                       \
+    })
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_2(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Load data
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 2);
+    data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
+    // Perform calculations
+    data0             = POOL_OP(data0, data1);
+    ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+    res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+/** Performs a pooling function of pool size equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_3(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Load data
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 3);
+    data1 = POW2_OP(data1, 3);
+    data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
+    // Perform calculations
+    data0             = POOL_OP(data0, data1);
+    data0             = POOL_OP(data0, data2);
+    ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+#if defined(POOLING3x3)
+
+#define CONVERT_OP(data_type) convert_##data_type##4
+#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
+
+VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+                     const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int4       start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+    int        start_y = get_global_id(1) * stride_y - pad_y;
+    const int4 end_x   = min(start_x + (int4)pool_size, (int4)upper_bound_w);
+    const int  end_y   = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max((int4)0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+}
+
+/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_optimized_3(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    res;
+
+    // Perform pooling 3x3 for 4 output elements
+    POOLING3x3(res, input, output);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(POOLING3x3)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Performs a pooling function of pool size equal to N  (NCHW)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+    vdata               = INITIAL_VALUE;
+    ACC_DATA_TYPE sdata = INITIAL_VALUE;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE_Y; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+        {
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            vdata = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE_X; ++x)
+        {
+            ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            sdata = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    reduce2           = POOL_OP(reduce4.s01, reduce4.s23);
+    ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+    res               = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+                                       const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = get_global_id(1) * stride_x - pad_x;
+#if defined(DST_DEPTH)
+    int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
+#else  /* defined(DST_DEPTH) */
+    int       start_y    = get_global_id(2) * stride_y - pad_y;
+#endif /* defined(DST_DEPTH) */
+
+#if !defined(EXCLUDE_PADDING)
+    upper_bound_w += pad_x;
+    upper_bound_h += pad_y;
+#endif /* defined(EXCLUDE_PADDING) */
+    const int end_x = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NHWC)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_nhwc(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    // Get pixels pointer
+#if defined(DST_DEPTH)
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D  input      = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D  output     = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* defined(DST_DEPTH) */
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+    vdata = INITIAL_VALUE;
+
+    const int idx_width = get_global_id(1) * STRIDE_X;
+#if defined(DST_DEPTH)
+    const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
+#else  /* defined(DST_DEPTH) */
+    const int idx_height = get_global_id(2) * STRIDE_Y;
+#endif /* defined(DST_DEPTH) */
+
+    for(int y = 0; y < POOL_SIZE_Y; ++y)
+    {
+        int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT);
+        for(int x = 0; x < POOL_SIZE_X; ++x)
+        {
+            int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
+            x1     = select(x1, PAD_X - idx_width - 1, y != y1);
+
+#if defined(DST_DEPTH)
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+#else  /* defined(DST_DEPTH) */
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#endif /* defined(DST_DEPTH) */
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            vdata = POOL_OP(vdata, CONVERT(data0, VEC_DATA_TYPE(ACC_DATA_TYPE, 8)));
+        }
+    }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    vdata = DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    vdata = SQRT_OP(vdata);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    vstore8(CONVERT(vdata, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)output.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/pooling_layer_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/pooling_layer_quantized.clembed
new file mode 100644
index 0000000..4a4271a
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/pooling_layer_quantized.clembed

@@ -0,0 +1,808 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res)                                                                                  \
+    {                                                                                                                                                                 \
+        const VEC_FLOAT(VEC_SIZE) in_f32  = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+        const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset));                            \
+        res                               = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE));                                                \
+    }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#define DIV_OP_NHWC(x, y) (convert_float8(x) * (float8)(1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                        const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NCHW)
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int8 vdata = INITIAL_VALUE;
+    int  sdata = INITIAL_VALUE;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE_Y; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+        {
+            VEC_TYPE(8)
+            data       = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            int8 data0 = convert_int8(data);
+            vdata      = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE_X; ++x)
+        {
+            DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            int data0      = convert_int(data);
+            sdata          = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+    int  res     = POOL_OP(reduce2.s0, reduce2.s1);
+    res          = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+    res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
+#endif /* defined(POOL_AVG) */
+
+    DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+    const float result_f32   = convert_float(result_q8);
+    const float input_offset = (float)OFFSET_IN1;
+    const float input_scale  = (float)SCALE_IN1;
+    const float scale_out    = (float)SCALE_OUT;
+    const float offset_out   = (float)OFFSET_OUT;
+    const float in_f32       = (result_f32 - input_offset) * input_scale;
+    const float out_f32      = in_f32 / scale_out + offset_out;
+    result_q8                = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    *(__global DATA_TYPE *)output.ptr = result_q8;
+}
+
+int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+                             const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = get_global_id(1) * stride_x - pad_x;
+#if defined(DST_DEPTH)
+    int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
+#else  /* defined(DST_DEPTH) */
+    int       start_y    = get_global_id(2) * stride_y - pad_y;
+#endif /* defined(DST_DEPTH) */
+
+    const int end_x = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = min(start_y + pool_size_y, upper_bound_h);
+
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NHWC)
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nhwc(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    // Get pixels pointer
+#if defined(DST_DEPTH)
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D  input      = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D  output     = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* defined(DST_DEPTH) */
+
+    int8 vdata = INITIAL_VALUE;
+
+    const int idx_width = get_global_id(1) * STRIDE_X;
+#if defined(DST_DEPTH)
+    const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
+#else  /* defined(DST_DEPTH) */
+    const int idx_height = get_global_id(2) * STRIDE_Y;
+#endif /* defined(DST_DEPTH) */
+
+    for(int y = 0; y < POOL_SIZE_Y; ++y)
+    {
+        int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT);
+        for(int x = 0; x < POOL_SIZE_X; ++x)
+        {
+            int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
+            x1     = select(x1, PAD_X - idx_width - 1, y != y1);
+
+#if defined(DST_DEPTH)
+            VEC_TYPE(8)
+            data = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+#else  /* defined(DST_DEPTH) */
+            VEC_TYPE(8)
+            data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#endif /* defined(DST_DEPTH) */
+
+            int8 data0 = convert_int8(data);
+            vdata      = POOL_OP(vdata, data0);
+        }
+    }
+
+#if defined(POOL_AVG)
+    // Divide by pool region in case of average pooling
+    vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
+#endif /* defined(POOL_AVG) */
+
+    VEC_TYPE(8)
+    out_q8 = CONVERT(vdata, VEC_TYPE(8));
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    REQUANTIZE(8, out_q8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q8);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    // Store result
+    vstore8(out_q8, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(INITIAL_VALUE) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/prior_box_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/prior_box_layer.clembed
new file mode 100644
index 0000000..4e80e97
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/prior_box_layer.clembed

@@ -0,0 +1,682 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
+
+/**  Compute prior boxes and clip (NCHW)
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx                                  Index to write to
+ * @param[in]  center_x                             Center value of the x axis
+ * @param[in]  center_y                             Center value of the y axis
+ * @param[in]  box_width                            Prior box width
+ * @param[in]  box_height                           Prior box height
+ *
+ */
+inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+    float xmin = (center_x - box_width / 2.f) / WIDTH;
+    float ymin = (center_y - box_height / 2.f) / HEIGHT;
+    float xmax = (center_x + box_width / 2.f) / WIDTH;
+    float ymax = (center_y + box_height / 2.f) / HEIGHT;
+
+#if defined(CLIP)
+    xmin = clamp(xmin, 0.f, 1.f);
+    ymin = clamp(ymin, 0.f, 1.f);
+    xmax = clamp(xmax, 0.f, 1.f);
+    ymax = clamp(ymax, 0.f, 1.f);
+#endif // defined(CLIP)
+
+    // Store result
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0)));
+}
+
+/** Compute prior boxes (NCHW)
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  min_size                             Prior box min size
+ * @param[in]  min_idx                              Index of the min vector
+ * @param[in]  idx                                  Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+    const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X;
+    const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+    float box_width  = min_size;
+    float box_height = min_size;
+    calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+    idx += 4;
+
+    if(max_size > 0)
+    {
+        box_width  = sqrt(min_size * max[min_idx]);
+        box_height = box_width;
+        calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+        idx += 4;
+    }
+    for(unsigned int i = 0; i < aspect_ratios_size; ++i)
+    {
+        if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
+        {
+            continue;
+        }
+        box_width  = min_size * sqrt(aspect_ratios[i]);
+        box_height = min_size * rsqrt(aspect_ratios[i]);
+
+        calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+        idx += 4;
+    }
+
+    return idx;
+}
+/** Calculate prior boxes with NCHW format.
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  min                                  The minimum values
+ * @param[in]  max                                  The maximum_values
+ * @param[in]  aspect_ratios                        The aspect ratio values
+ * @param[in]  min_size                             The minimum values size
+ * @param[in]  max_size                             The maximum_values values size
+ * @param[in]  aspect_ratios_size                   The aspect ratio values size
+ */
+__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+                                   unsigned int aspect_ratios_size)
+{
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    int idx = 0;
+    for(unsigned int i = 0; i < min_size; ++i)
+    {
+        idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
+    }
+
+    // Store variances
+    for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
+    {
+        vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
+    }
+}
+#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/quantization_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/quantization_layer.clembed
new file mode 100644
index 0000000..ecf37be
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/quantization_layer.clembed

@@ -0,0 +1,651 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
+
+/** This performs the quantization of floating point inputs or 8-bit quantized integers to 8-bit integers.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g. -DDATA_TYPE=short
+ * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g. -DSCALE=0.125
+ * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. -DOFFSET=125
+ * @note Minimum value for quantized type should be given as a preprocessor argument using -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
+ * @note Maximum value for quantized type should be given as a preprocessor argument using -DMAX_QUANT_VAL=value. e.g. -DMAXIN_QUANT_VAL=255
+ * @note If the input data type if a floating point (F16 or F32) the preprocessor argument should be give as -DIS_FLOAT
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void quantization_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+#if defined(IS_FLOAT)
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    val_float = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset  = OFFSET;
+#else // defined(IS_FLOAT)
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    val_float = CONVERT(val, VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset  = OFFSET;
+#endif // defined(IS_FLOAT)
+
+    // Quantize
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+#else  //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/range.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/range.clembed
new file mode 100644
index 0000000..5f76574
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/range.clembed

@@ -0,0 +1,688 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
+/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consequtive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DDATA_TYPE=4
+ *
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range(
+    VECTOR_DECLARATION(out))
+{
+    uint           id      = get_global_id(0) * VECTOR_SIZE;
+    __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    DATA_TYPE seq;
+    seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
+
+    *((__global DATA_TYPE *)dst_ptr) = seq;
+#else // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    seq;
+
+    seq.s0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
+#if VECTOR_SIZE > 1
+    seq.s1 = seq.s0 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 2
+    seq.s2 = seq.s1 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 3
+    seq.s3 = seq.s2 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 4
+    seq.s4 = seq.s3 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 5
+    seq.s5 = seq.s4 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 6
+    seq.s6 = seq.s5 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 7
+    seq.s7 = seq.s6 + (DATA_TYPE)STEP;
+#endif // VECTOR_SIZE > 7
+#endif // VECTOR_SIZE > 6
+#endif // VECTOR_SIZE > 5
+#endif // VECTOR_SIZE > 4
+#endif // VECTOR_SIZE > 3
+#endif // VECTOR_SIZE > 2
+#endif // VECTOR_SIZE > 1
+    VSTORE(VECTOR_SIZE)
+    (seq, 0, ((__global DATA_TYPE *)dst_ptr));
+#endif //VECTOR_SIZE == 1
+}
+
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consequtive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=vector_size. e.g. -DDATA_TYPE=4
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ *
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8.
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range_quantized(
+    VECTOR_DECLARATION(out))
+{
+    size_t         id      = get_global_id(0) * VECTOR_SIZE;
+    __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    float seq;
+    seq                          = (float)START + (float)id * (float)STEP;
+    seq                          = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
+    seq                          = max(0.0f, min(seq, 255.0f));
+    *((__global uchar *)dst_ptr) = CONVERT_SAT(CONVERT_DOWN(seq, int), uchar);
+#else // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(float, VECTOR_SIZE)
+    seq;
+    seq.s0 = (float)START + id * (float)STEP;
+#if VECTOR_SIZE > 1
+    seq.s1 = seq.s0 + (float)STEP;
+#if VECTOR_SIZE > 2
+    seq.s2 = seq.s1 + (float)STEP;
+#if VECTOR_SIZE > 3
+    seq.s3 = seq.s2 + (float)STEP;
+#if VECTOR_SIZE > 4
+    seq.s4 = seq.s3 + (float)STEP;
+#if VECTOR_SIZE > 5
+    seq.s5 = seq.s4 + (float)STEP;
+#if VECTOR_SIZE > 6
+    seq.s6 = seq.s5 + (float)STEP;
+#if VECTOR_SIZE > 7
+    seq.s7 = seq.s6 + (float)STEP;
+#endif // VECTOR_SIZE > 7
+#endif // VECTOR_SIZE > 6
+#endif // VECTOR_SIZE > 5
+#endif // VECTOR_SIZE > 4
+#endif // VECTOR_SIZE > 3
+#endif // VECTOR_SIZE > 2
+#endif // VECTOR_SIZE > 1
+    seq    = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
+    seq    = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
+    VEC_DATA_TYPE(uchar, VECTOR_SIZE)
+    res = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(uchar, VECTOR_SIZE));
+    VSTORE(VECTOR_SIZE)
+    (res, 0, ((__global DATA_TYPE *)dst_ptr));
+#endif // VECTOR_SIZE == 1
+}
+#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
+
+#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/reduction_operation.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/reduction_operation.clembed
new file mode 100644
index 0000000..c22831b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/reduction_operation.clembed

@@ -0,0 +1,2082 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((int16)0, (int16)-1, x > y)
+#define ISLESS(x, y) select((int16)0, (int16)-1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+/** Calculate square sum of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return square sum of vector.
+ */
+inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input);
+
+    in *= in;
+
+    in.s01234567 += in.s89ABCDEF;
+    in.s0123 += in.s4567;
+    in.s01 += in.s23;
+
+    return (in.s0 + in.s1);
+}
+
+/** Calculate sum of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return sum of vector.
+ */
+inline DATA_TYPE sum(__global const DATA_TYPE *input)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input);
+
+    in.s01234567 += in.s89ABCDEF;
+    in.s0123 += in.s4567;
+    in.s01 += in.s23;
+
+    return (in.s0 + in.s1);
+}
+
+/** Calculate product of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return product of vector.
+ */
+inline DATA_TYPE product(__global const DATA_TYPE *input)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input);
+
+    in.s01234567 *= in.s89ABCDEF;
+    in.s0123 *= in.s4567;
+    in.s01 *= in.s23;
+
+    return (in.s0 * in.s1);
+}
+#if defined(OPERATION)
+/** This kernel performs parallel reduction given an operation on x-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
+ *
+ * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[in] partial_res_ptr                           The local buffer to hold partial result values. Supported data types: same as @p src_ptr
+ * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] local_results                             Local buffer for storing the partial result
+ */
+__kernel void reduction_operation_x(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(partial_res),
+    __local DATA_TYPE *local_results)
+{
+    Image src         = CONVERT_TO_IMAGE_STRUCT(src);
+    Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
+
+    unsigned int lsize = get_local_size(0);
+    unsigned int lid   = get_local_id(0);
+
+    for(unsigned int y = 0; y < get_local_size(1); ++y)
+    {
+        local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Perform parallel reduction
+        for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
+        {
+            if(lid < i)
+            {
+#if defined(PROD)
+                local_results[lid] *= local_results[lid + i];
+#else  // !defined(PROD)
+                local_results[lid] += local_results[lid + i];
+#endif // defined(PROD)
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lid == 0)
+        {
+#if defined(MEAN) && defined(WIDTH)
+            if(y == get_local_size(1) - 1)
+            {
+                local_results[0] /= WIDTH;
+            }
+#endif // defined(MEAN) && defined(WIDTH)
+            ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+        }
+    }
+}
+#endif // defined(OPERATION)
+
+#if defined(WIDTH)
+/** This kernel performs reduction on x-axis. (Non parallel)
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
+ *
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8 for operation MEAN
+ * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_non_parallel_x(
+    VECTOR_DECLARATION(src),
+    VECTOR_DECLARATION(output))
+{
+    Vector src    = CONVERT_TO_VECTOR_STRUCT(src);
+    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+    DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, 0)), DATA_TYPE_PROMOTED);
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    for(unsigned int x = 1; x < WIDTH; ++x)
+    {
+        DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, x)), DATA_TYPE_PROMOTED);
+#if defined(MIN)
+        res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
+#elif defined(MAX)
+        res                             = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
+#elif defined(PROD)
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+#else  // defined(SUM))
+        res += in;
+#endif // defined(MAX) || defined(MIN) || defined(PROD)
+    }
+
+    // Store result
+#if defined(MEAN)
+    res /= WIDTH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (WIDTH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(MIN) || defined(MAX)
+    *((__global DATA_TYPE_PROMOTED *)output.ptr) = res;
+#else  // !(defined(MIN) || defined(MAX))
+    *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
+#endif // defined(MIN) || defined(MAX)
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(output))
+{
+    Image src    = CONVERT_TO_IMAGE_STRUCT(src);
+    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int y = 1; y < HEIGHT; ++y)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res                             = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else  // !defined(PROD)
+        res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= HEIGHT;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (HEIGHT - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptt
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(COMPLEX)
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#endif // defined(COMPLEX)
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int z = 1; z < DEPTH; ++z)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+#if defined(COMPLEX)
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#endif // defined(COMPLEX)
+
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res                             = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+        res += in;
+#if defined(COMPLEX)
+        res1 += in1;
+#endif // defined(COMPLEX)
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= DEPTH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (DEPTH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#if defined(COMPLEX)
+    vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
+#endif // defined(COMPLEX)
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptt
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int w = 1; w < BATCH; ++w)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res                             = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else  // !defined(PROD)
+        res += in;
+#endif //defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= BATCH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (BATCH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/remap.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/remap.clembed
new file mode 100644
index 0000000..23a2043
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/remap.clembed

@@ -0,0 +1,1353 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size(coords, width, height, 1);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const float16 t = (float16)(
+                          /* tl, tr, bl, br */
+                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
+
+/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ *     out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
+ * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
+ * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapx_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapx_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapy_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapy_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  width                              Width of the input image
+ * @param[in]  height                             Height of the input image
+ */
+__kernel void remap_nearest_neighbour(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(mapx),
+    IMAGE_DECLARATION(mapy),
+    const float width,
+    const float height)
+{
+    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
+    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+    map_coords += (float8)(0.5f);
+
+    vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ *     out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
+ * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
+ * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapx_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapx_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapy_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapy_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  width                              Width of the input image
+ * @param[in]  height                             Height of the input image
+ */
+__kernel void remap_bilinear(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(mapx),
+    IMAGE_DECLARATION(mapy),
+    const float width,
+    const float height)
+{
+    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
+    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+    vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/reorg_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/reorg_layer.clembed
new file mode 100644
index 0000000..a27d192
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/reorg_layer.clembed

@@ -0,0 +1,659 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi)     \
+    ({                                                        \
+        int offset = zo / (int)SRC_DEPTH;                     \
+        xi         = xo * (int)STRIDE + offset % (int)STRIDE; \
+        yi         = yo * (int)STRIDE + offset / (int)STRIDE; \
+        zi         = zo % SRC_DEPTH;                          \
+    })
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int xo = get_global_id(0);
+    int yo = get_global_id(1);
+    int zo = get_global_id(2);
+    int xi, yi, zi;
+
+    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+    int src_offset                   = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int xo = get_global_id(1);
+    int yo = get_global_id(2);
+    int zo = get_global_id(0);
+    int xi, yi, zi;
+
+    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+    int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/repeat.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/repeat.hembed
new file mode 100644
index 0000000..f5aa99d
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/repeat.hembed

@@ -0,0 +1,87 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+/** Macros that help in loop unrolling */
+//Repeat macros with 3 param, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(1, P_A, P_B, P_C);       \
+    REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(2, P_A, P_B, P_C);       \
+    REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(3, P_A, P_B, P_C);       \
+    REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(4, P_A, P_B, P_C);       \
+    REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(5, P_A, P_B, P_C);       \
+    REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(6, P_A, P_B, P_C);       \
+    REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(7, P_A, P_B, P_C);       \
+    REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(8, P_A, P_B, P_C);       \
+    REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(9, P_A, P_B, P_C);        \
+    REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(A, P_A, P_B, P_C);        \
+    REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(B, P_A, P_B, P_C);        \
+    REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(C, P_A, P_B, P_C);        \
+    REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(D, P_A, P_B, P_C);        \
+    REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(E, P_A, P_B, P_C);        \
+    REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(F, P_A, P_B, P_C);        \
+    REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+//Macro for initializing N variables. generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
+
+#endif // ARM_COMPUTE_REPEAT_H
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/reshape_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/reshape_layer.clembed
new file mode 100644
index 0000000..c5869c7
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/reshape_layer.clembed

@@ -0,0 +1,613 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_shape                          Input spatial shape
+ * @param[in]  output_shape                         Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
+                            TENSOR3D_DECLARATION(output),
+                            int2 input_shape,
+                            int2 output_shape)
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+    int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+    // Linearize index
+    int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+    // Translate to output
+    int3 out_id;
+    out_id.x = linear_idx % output_shape.x;
+    out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+    out_id.z = linear_idx / (output_shape.x * output_shape.y);
+
+    // Store result
+    *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/reverse.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/reverse.clembed
new file mode 100644
index 0000000..16a145d
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/reverse.clembed

@@ -0,0 +1,645 @@
+R"(
+
+/*
+* Copyright (c) 2018 ARM Limited.
+*
+* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to
+* deal in the Software without restriction, including without limitation the
+* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+* sell copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+#if NUM_REVERSE_DIMS > 4
+#error("Reversing more than 4 dimensions is not currently supported")
+#endif /* NUM_REVERSE_DIMS > 4 */
+
+/** Performs reverse along the specified axis.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
+ * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  axis_ptr                           Pointer to the source vector. Supported data types: U32
+ * @param[in]  axis_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  axis_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  axis_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void reverse(TENSOR4D_DECLARATION(src),
+                      VECTOR_DECLARATION(axis),
+                      TENSOR4D_DECLARATION(dst),
+                      const uint width,
+                      const uint height,
+                      const uint depth,
+                      const uint batches)
+{
+    Tensor4D src  = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
+    Vector   axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
+    Tensor4D dst  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
+
+    const uint x_in = get_global_id(0);
+    const uint y_in = get_global_id(1);
+    const uint z_in = get_global_id(2) % depth;
+    const uint w_in = get_global_id(2) / depth;
+
+    const uint4 dims       = (uint4)(0, 1, 2, 3);
+    int4        to_reverse = (int4)(0, 0, 0, 0);
+#if NUM_REVERSE_DIMS == 1
+    const uint index = *((__global uint *)axis.ptr);
+    to_reverse       = (uint4)index == dims;
+#elif NUM_REVERSE_DIMS == 2
+    const uint2 indices = vload2(0, (__global uint *)axis.ptr);
+    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+#elif NUM_REVERSE_DIMS == 3
+    const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
+    const uint index2     = *((__global uint *)axis.ptr + 2);
+    to_reverse            = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
+#else  /* NUM_REVERSE_DIMS == 3 */
+    const uint4 indices = vload4(0, (__global uint *)axis.ptr);
+    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+#endif /* NUM_REVERSE_DIMS == 1 */
+    const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
+    const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
+    const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
+    const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/roi_align_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/roi_align_layer.clembed
new file mode 100644
index 0000000..2c66c50
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/roi_align_layer.clembed

@@ -0,0 +1,743 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int   pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else  // !defined(NHWC)
+            const DATA_TYPE data1                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+            sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+        }
+    }
+
+    return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
+}
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ *       will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source tensor as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else  // !defined(NHWC)
+    const int                                  px = get_global_id(0);
+    const int                                  py = get_global_id(1);
+    const int                                  pw = get_global_id(2);
+#endif // defined(NHWC)
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi                 = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
+    const float2 roi_dims   = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    const float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2       region_start = spatial_indx * bin_size + roi_anchor;
+    float2       region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    const float2 roi_bin_grid = SAMPLING_RATIO;
+#else  // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+    const float2   roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else  // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr  = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
+                                                         region_start.x,
+                                                         bin_size.x,
+                                                         roi_bin_grid.x,
+                                                         region_end.x,
+                                                         region_start.y,
+                                                         bin_size.y,
+                                                         roi_bin_grid.y,
+                                                         region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/roi_align_layer_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/roi_align_layer_quantized.clembed
new file mode 100644
index 0000000..7780126
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/roi_align_layer_quantized.clembed

@@ -0,0 +1,1238 @@
+R"(
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int   pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else  // !defined(NHWC)
+            const DATA_TYPE data1                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+
+            const float data1_f32 = DEQUANTIZE(data1, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data2_f32 = DEQUANTIZE(data2, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data3_f32 = DEQUANTIZE(data3, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data4_f32 = DEQUANTIZE(data4, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            sum += w1 * data1_f32 + w2 * data2_f32 + w3 * data3_f32 + w4 * data4_f32;
+        }
+    }
+
+    const float res_f32 = sum / (grid_size_x * grid_size_y);
+    return QUANTIZE(res_f32, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+}
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ *       will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source tensor as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
+ *                                                  Supported data types: QASYMM16 with 0.125f scale and 0 offset
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer_quantized(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else  // !defined(NHWC)
+    const int                                  px = get_global_id(0);
+    const int                                  py = get_global_id(1);
+    const int                                  pw = get_global_id(2);
+#endif // defined(NHWC)
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
+    float4 roi             = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
+    float2 roi_anchor      = roi.s01 * convert_float(SPATIAL_SCALE);
+    float2 roi_dims        = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    float2 spatial_indx     = (float2)(px, py);
+    float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2 region_start = spatial_indx * bin_size + roi_anchor;
+    float2 region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    float2 roi_bin_grid = SAMPLING_RATIO;
+#else  // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+    float2       roi_bin_grid           = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else  // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
+                                                         region_start.x,
+                                                         bin_size.x,
+                                                         roi_bin_grid.x,
+                                                         region_end.x,
+                                                         region_start.y,
+                                                         bin_size.y,
+                                                         roi_bin_grid.y,
+                                                         region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/roi_pooling_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/roi_pooling_layer.clembed
new file mode 100644
index 0000000..f7e618d
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/roi_pooling_layer.clembed

@@ -0,0 +1,713 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if DATA_SIZE == 32
+#define VEC_SIZE 4
+#define VEC_MAX vec4_max
+#elif DATA_SIZE == 16
+#define VEC_SIZE 8
+#define VEC_MAX vec8_max
+#else /* DATA_SIZE not equals 32 or 16 */
+#error "Unsupported data size"
+#endif /* DATA_SIZE == 32 */
+
+inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    temp = fmax(vec.lo, vec.hi);
+    return fmax(temp.x, temp.y);
+}
+
+inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    temp = fmax(vec.lo, vec.hi);
+    return vec4_max(temp);
+}
+
+/** Performs a roi pooling on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return A max pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int region_end_x, int region_start_y, int region_end_y, int pz)
+{
+    // Iterate through the pooling region
+    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+    {
+        return (DATA_TYPE)0;
+    }
+    else
+    {
+        int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE);
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(-FLT_MAX);
+        for(int j = region_start_y; j < region_end_y; ++j)
+        {
+            int i = region_start_x;
+            for(; i < region_start_x + num_iter * VEC_SIZE; i += VEC_SIZE)
+            {
+                VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+                val      = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz));
+                curr_max = fmax(val, curr_max);
+            }
+            for(; i < region_end_x; ++i)
+            {
+                DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz);
+                curr_max      = fmax(curr_max, val);
+            }
+        }
+        return (DATA_TYPE)VEC_MAX(curr_max);
+    }
+}
+
+/** Performs a roi pooling function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source image as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_stride_w                       Stride of the source image in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination image in W dimension (in bytes)
+ */
+__kernel void roi_pooling_layer(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pw = get_global_id(2);
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi               = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+    const int2 roi_dims   = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
+
+    // Calculate pooled region start and end
+    const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const int2   max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y);
+    int2         region_start     = convert_int2_sat(floor(spatial_indx / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+    int2         region_end       = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+        *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_pool_1x1(&input,
+                                                                                                       region_start.x,
+                                                                                                       region_end.x,
+                                                                                                       region_start.y,
+                                                                                                       region_end.y, pz);
+    }
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/scale.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/scale.clembed
new file mode 100644
index 0000000..03a7cf6
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/scale.clembed

@@ -0,0 +1,1497 @@
+R"(
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size(coords, width, height, 1);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const float16 t = (float16)(
+                          /* tl, tr, bl, br */
+                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x       = in_x_coords * (float4)(scale.s0);
+    const float4 new_y       = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x       = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+    const float4 new_y       = (float4)((coord.s1 + 0.5f) * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r   = (float2)(scale_x, scale_y);
+    const float8 tc  = clamp_to_border_with_size(transform_nearest(get_current_coords(), r), input_width, input_height, BORDER_SIZE);
+    vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r   = (float2)(scale_x, scale_y);
+    const float8 tc  = transform_bilinear(get_current_coords(), r);
+    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+#if defined(DEPTH_OUT)
+/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nhwc(
+    TENSOR4D_DECLARATION(in),
+    TENSOR4D_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float new_x = get_global_id(1) * scale_x;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+    const float new_x = (get_global_id(1) + 0.5f) * scale_x;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+    const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
+    const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
+}
+
+/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_nhwc(
+    TENSOR4D_DECLARATION(in),
+    TENSOR4D_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float new_x = get_global_id(1) * scale_x;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+
+    const float new_xf      = floor(new_x);
+    const float new_yf      = floor(new_y);
+    float       clamped_x   = clamp(new_xf, 0.0f, input_width - 1);
+    float       clamped_x1  = clamp(new_xf + 1, 0.0f, input_width - 1);
+    float       clamped_x_  = clamped_x;
+    float       clamped_x1_ = clamped_x1;
+    const float clamped_y   = clamp(new_yf, 0.0f, input_height - 1);
+    const float clamped_y1  = clamp(new_yf + 1, 0.0f, input_height - 1);
+
+#ifndef BORDER_MODE_REPLICATE
+    clamped_x1  = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
+    clamped_x_  = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+    clamped_x   = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+    clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
+#endif /* BORDER_MODE_REPLICATE */
+
+    float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
+
+    const float a  = new_x - new_xf;
+    const float b  = 1.f - a;
+    const float a1 = new_y - new_yf;
+    const float b1 = 1.f - a1;
+    const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
+
+    *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
+}
+#endif /* defined(DEPTH_OUT) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/scale_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/scale_quantized.clembed
new file mode 100644
index 0000000..98ba849
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/scale_quantized.clembed

@@ -0,0 +1,2369 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size_quantized(coords, width, height, 1);
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords_quantized(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords_quantized()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in            Pointer to the source image.
+ * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width         Width of the image
+ * @param[in] height        Height of the image
+ * @param[in] border_size   Border size
+ * @param[in] scale         Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
+                                                                                    const float scale, const int offset_qasymm)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const int16 t = (int16)(
+                        /* tl, tr, bl, br */
+                        * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                        *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                        *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                        *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                        *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                        *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                        *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                        *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+
+    const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
+
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
+                          ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
+                          ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
+                          ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
+
+    const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));
+
+    return res;
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in            Pointer to the source image.
+ * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width         Width of the image
+ * @param[in] height        Height of the image
+ * @param[in] scale         Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
+{
+    return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r   = (float2)(scale_x, scale_y);
+    const float8 tc  = transform_bilinear_quantized(get_current_coords_quantized(), r);
+    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+#if defined(DEPTH_OUT)
+/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        dst_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nhwc(
+    TENSOR4D_DECLARATION(in),
+    TENSOR4D_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float new_x = get_global_id(1) * scale_x;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+
+    const float new_xf      = floor(new_x);
+    const float new_yf      = floor(new_y);
+    float       clamped_x   = clamp(new_xf, 0.0f, input_width - 1);
+    float       clamped_x1  = clamp(new_xf + 1, 0.0f, input_width - 1);
+    float       clamped_x_  = clamped_x;
+    float       clamped_x1_ = clamped_x1;
+    const float clamped_y   = clamp(new_yf, 0.0f, input_height - 1);
+    const float clamped_y1  = clamp(new_yf + 1, 0.0f, input_height - 1);
+
+#ifndef BORDER_MODE_REPLICATE
+    clamped_x1  = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
+    clamped_x_  = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+    clamped_x   = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+    clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
+#endif /* BORDER_MODE_REPLICATE */
+
+    int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
+
+    const float  a      = new_x - new_xf;
+    const float  b      = 1.f - a;
+    const float  a1     = new_y - new_yf;
+    const float  b1     = 1.f - a1;
+    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
+
+    const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
+
+    DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
+
+    *((__global DATA_TYPE *)out.ptr) = res;
+}
+#endif /* defined(DEPTH_OUT) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/scharr_filter.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/scharr_filter.clembed
new file mode 100644
index 0000000..71e95a6
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/scharr_filter.clembed

@@ -0,0 +1,667 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This OpenCL kernel computes Scharr3x3.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image Supported data types: S16
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scharr3x3(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif /* GRAD_Y */
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+
+    // Output pixels
+#ifdef GRAD_X
+    short8 gx = (short8)0;
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    short8 gy = (short8)0;
+#endif /* GRAD_Y */
+
+    // Row0
+    uchar16 temp   = vload16(0, offset(&src, -1, -1));
+    short8  left   = convert_short8(temp.s01234567);
+    short8  middle = convert_short8(temp.s12345678);
+    short8  right  = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-3);
+    gx += right * (short8)(+3);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    gy += left * (short8)(-3);
+    gy += middle * (short8)(-10);
+    gy += right * (short8)(-3);
+#endif /* GRAD_Y */
+
+    // Row1
+    temp  = vload16(0, offset(&src, -1, 0));
+    left  = convert_short8(temp.s01234567);
+    right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-10);
+    gx += right * (short8)(+10);
+#endif /* GRAD_X */
+
+    // Row2
+    temp   = vload16(0, offset(&src, -1, 1));
+    left   = convert_short8(temp.s01234567);
+    middle = convert_short8(temp.s12345678);
+    right  = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-3);
+    gx += right * (short8)(+3);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    gy += left * (short8)(+3);
+    gy += middle * (short8)(+10);
+    gy += right * (short8)(+3);
+#endif /* GRAD_Y */
+
+    // Store results
+#ifdef GRAD_X
+    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif /* GRAD_Y */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/select.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/select.clembed
new file mode 100644
index 0000000..e5dc710
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/select.clembed

@@ -0,0 +1,759 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE)
+/** This function perform a select operation between two tensors when condition tensor has the same rank.
+ *
+ * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  c_step_y                          c_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  c_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  c_step_z                          c_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_same_rank(
+    TENSOR3D_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D c_t   = CONVERT_TO_TENSOR3D_STRUCT(c);
+    Tensor3D x_t   = CONVERT_TO_TENSOR3D_STRUCT(x);
+    Tensor3D y_t   = CONVERT_TO_TENSOR3D_STRUCT(y);
+    Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+    in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+
+/** This function perform a select operation between two tensors when condition tensor has a different rank.
+ *
+ * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_2(
+    VECTOR_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    const int c_idx = get_global_id(1);
+
+    // Get pixels pointer
+    Vector   c_t   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
+    Tensor3D x_t   = CONVERT_TO_TENSOR3D_STRUCT(x);
+    Tensor3D y_t   = CONVERT_TO_TENSOR3D_STRUCT(y);
+    Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+    in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) */
+
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE)
+/** This function perform a select operation between two tensors when condition tensor has a different rank.
+ *
+ * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_n(
+    VECTOR_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    const int c_idx = get_global_id(2) / DEPTH_SIZE;
+
+    // Get pixels pointer
+    Vector   c_t   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
+    Tensor3D x_t   = CONVERT_TO_TENSOR3D_STRUCT(x);
+    Tensor3D y_t   = CONVERT_TO_TENSOR3D_STRUCT(y);
+    Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+    in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/slice_ops.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/slice_ops.clembed
new file mode 100644
index 0000000..60944c9
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/slice_ops.clembed

@@ -0,0 +1,678 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Perform a strided slice operation on a given input.
+ *
+ * @attention Supported tensor rank: up to 4
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input and output tensor dephts should be given as a preprocessor arguments using -DSRC_DEPTH=size. and -DDST_DEPTH=size
+ * @attention Absolute start coordinates for each dimension should be given as preprocessor -DSTART_index=value e.g. -DSTART_0=2
+ * @attention Strides for each dimension should be given as preprocessor -DSTRIDE_index=value e.g. -DSTRIDE_1=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/QASYMM16/QSYMM16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void strided_slice(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+
+    int offset = 0;
+
+    // Offset X
+#if defined(SHRINK_0)
+    input.ptr += (int)START_0 * input_stride_x;
+#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    offset       = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
+    input.ptr += offset * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+#elif defined(START_0) && defined(STRIDE_0)
+    offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+    input.ptr += offset * input_stride_x;
+#endif // defined(START_0) && defined(STRIDE_0)
+
+    // Offset Y
+#if defined(SHRINK_1)
+    input.ptr += (int)START_1 * input_stride_y;
+#elif defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_0)
+    offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1;
+#else  // defined(SHRINK_0)
+    offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+#endif // defined(SHRINK_0)
+    input.ptr += offset * input_stride_y;
+#endif // defined(START_1) && defined(STRIDE_1)
+
+    // Offset Z
+#if defined(SHRINK_2)
+    input.ptr += (int)START_2 * input_stride_z;
+#elif defined(START_2) && defined(STRIDE_2)
+
+#if defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2;
+#elif defined(SHRINK_1) || defined(SHRINK_0)
+    offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2;
+#else  // defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+#endif // defined(SHRINK_1) && defined(SHRINK_0)
+
+    input.ptr += offset * input_stride_z;
+#endif // defined(START_2) && defined(STRIDE_2)
+
+    // Offset depth
+#if defined(SHRINK_3)
+    input.ptr += (int)START_3 * input_stride_w;
+#elif defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3;
+#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0)
+    offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2))
+    offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3;
+#else  // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3;
+#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    input.ptr += offset * input_stride_w;
+#endif // defined(START_3) && defined(STRIDE_3)
+
+    // Store result
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
+
+    VSTORE(VEC_SIZE)
+    (val, 0, (__global DATA_TYPE *)(output.ptr));
+#else  // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/sobel_filter.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/sobel_filter.clembed
new file mode 100644
index 0000000..48da2c5
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/sobel_filter.clembed

@@ -0,0 +1,1084 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/***********************************************/
+/*   Begin implementation of Sobel3x3 filter   */
+/***********************************************/
+
+/** This OpenCL kernel that computes a Sobel3x3 filter.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel3x3(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif /* GRAD_Y */
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+
+    // Output pixels
+#ifdef GRAD_X
+    short8 gx = (short8)0;
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    short8 gy = (short8)0;
+#endif /* GRAD_Y */
+
+    // Row0
+    uchar16 temp   = vload16(0, offset(&src, -1, -1));
+    short8  left   = convert_short8(temp.s01234567);
+    short8  middle = convert_short8(temp.s12345678);
+    short8  right  = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-1);
+    gx += right * (short8)(+1);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    gy += left * (short8)(-1);
+    gy += middle * (short8)(-2);
+    gy += right * (short8)(-1);
+#endif /* GRAD_Y */
+
+    // Row1
+    temp  = vload16(0, offset(&src, -1, 0));
+    left  = convert_short8(temp.s01234567);
+    right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-2);
+    gx += right * (short8)(+2);
+#endif /* GRAD_X */
+
+    // Row2
+    temp   = vload16(0, offset(&src, -1, 1));
+    left   = convert_short8(temp.s01234567);
+    middle = convert_short8(temp.s12345678);
+    right  = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+    gx += left * (short8)(-1);
+    gx += right * (short8)(+1);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    gy += left * (short8)(+1);
+    gy += middle * (short8)(+2);
+    gy += right * (short8)(+1);
+#endif /* GRAD_Y */
+
+    // Store results
+#ifdef GRAD_X
+    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif /* GRAD_Y */
+}
+
+/**********************************************/
+/*    End implementation of Sobel3x3 filter   */
+/**********************************************/
+
+/***********************************************/
+/*   Begin implementation of Sobel5x5 filter   */
+/***********************************************/
+
+/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src             Pointer to source image.
+ * @param[in] left1_coeff_gx  Weight of the most left pixel for gx
+ * @param[in] left2_coeff_gx  Weight of the left pixel for gx
+ * @param[in] middle_coeff_gx Weight of the middle pixel for gx
+ * @param[in] right1_coeff_gx Weight of the right pixel for gx
+ * @param[in] right2_coeff_gx Weight of the most right pixel for gx
+ * @param[in] left1_coeff_gy  Weight of the most left pixel for gy
+ * @param[in] left2_coeff_gy  Weight of the left pixel for gy
+ * @param[in] middle_coeff_gy Weight of the middle pixel for gy
+ * @param[in] right1_coeff_gy Weight of the right pixel for gy
+ * @param[in] right2_coeff_gy Weight of the most right pixel for gy
+ *
+ * @return a short16 containing short8 gx and short8 gy values.
+ */
+short16 sobel1x5(
+    Image      *src,
+    const short left1_coeff_gx,
+    const short left2_coeff_gx,
+    const short middle_coeff_gx,
+    const short right1_coeff_gx,
+    const short right2_coeff_gx,
+    const short left1_coeff_gy,
+    const short left2_coeff_gy,
+    const short middle_coeff_gy,
+    const short right1_coeff_gy,
+    const short right2_coeff_gy)
+{
+    uchar16 temp = vload16(0, offset(src, -2, 0));
+    short8  gx   = 0;
+    short8  gy   = 0;
+    short8  val;
+
+    val = convert_short8(temp.s01234567);
+    gx += val * (short8)left1_coeff_gx;
+    gy += val * (short8)left1_coeff_gy;
+
+    val = convert_short8(temp.s12345678);
+    gx += val * (short8)left2_coeff_gx;
+    gy += val * (short8)left2_coeff_gy;
+
+    val = convert_short8(temp.s23456789);
+    gx += val * (short8)middle_coeff_gx;
+    gy += val * (short8)middle_coeff_gy;
+
+    val = convert_short8(temp.s3456789a);
+    gx += val * (short8)right1_coeff_gx;
+    gy += val * (short8)right1_coeff_gy;
+
+    val = convert_short8(temp.s456789ab);
+    gx += val * (short8)right2_coeff_gx;
+    gy += val * (short8)right2_coeff_gy;
+
+    return (short16)(gx, gy);
+}
+
+/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
+ *
+ * @param[in] src          Pointer to source image.
+ * @param[in] up1_coeff    Weight of the most up pixel
+ * @param[in] up2_coeff    Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff  Weight of the down pixel
+ * @param[in] down2_coeff  Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+short8 sobel5x1(
+    Image      *src,
+    const short up1_coeff,
+    const short up2_coeff,
+    const short middle_coeff,
+    const short down1_coeff,
+    const short down2_coeff)
+{
+    short8 val;
+    short8 out = (short8)0;
+
+    val = vload8(0, (__global short *)offset(src, 0, -2));
+    out += val * (short8)up1_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, -1));
+    out += val * (short8)up2_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, 0));
+    out += val * (short8)middle_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, 1));
+    out += val * (short8)down1_coeff;
+
+    val = vload8(0, (__global short *)offset(src, 0, 2));
+    out += val * (short8)down2_coeff;
+
+    return (short8)(out);
+}
+
+/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image.. Supported data types: U8
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image.. Supported data types: S16
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x5(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif /* GRAD_Y */
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+
+    // Output pixels
+    short16 gx_gy = sobel1x5(&src,
+                             -1, -2, 0, 2, 1,
+                             1, 4, 6, 4, 1);
+
+    // Store result in dst
+#ifdef GRAD_X
+    vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
+#endif /* GRAD_Y */
+}
+
+/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
+ *  and output two single channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_x_ptr                            Pointer to the source image.. Supported data types: S16
+ * @param[in]  src_x_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_x_step_x                         src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_x_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_x_step_y                         src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_x_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  src_y_ptr                            Pointer to the source image. Supported data types: S16
+ * @param[in]  src_y_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_y_step_x                         src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_y_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_y_step_y                         src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_y_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  dummy                                Dummy parameter to easy conditional inclusion
+ */
+__kernel void sobel_separable5x1(
+#ifdef GRAD_X
+    IMAGE_DECLARATION(src_x),
+    IMAGE_DECLARATION(dst_gx),
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    IMAGE_DECLARATION(src_y),
+    IMAGE_DECLARATION(dst_gy),
+#endif /* GRAD_Y */
+    int dummy)
+{
+#ifdef GRAD_X
+    Image src_x  = CONVERT_TO_IMAGE_STRUCT(src_x);
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image src_y  = CONVERT_TO_IMAGE_STRUCT(src_y);
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+
+#ifdef GRAD_X
+    short8 gx = sobel5x1(&src_x,
+                         1, 4, 6, 4, 1);
+    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    short8 gy = sobel5x1(&src_y,
+                         -1, -2, 0, 2, 1);
+    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif /* GRAD_Y */
+}
+
+/**********************************************/
+/*    End implementation of Sobel5x5 filter   */
+/**********************************************/
+
+/***********************************************/
+/*   Begin implementation of Sobel7x7 filter   */
+/***********************************************/
+
+/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
+#define X0 -1
+#define X1 -4
+#define X2 -5
+#define X3 0
+#define X4 5
+#define X5 4
+#define X6 1
+
+/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients */
+#define Y0 1
+#define Y1 6
+#define Y2 15
+#define Y3 20
+#define Y4 15
+#define Y5 6
+#define Y6 1
+
+/* Calculates single horizontal iteration. */
+#define SOBEL1x1_HOR(src, gx, gy, idx)                               \
+    {                                                                \
+        int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
+        gx += val * X##idx;                                          \
+        gy += val * Y##idx;                                          \
+    }
+
+/* Calculates single vertical iteration. */
+#define SOBEL1x1_VERT(src, g, direction, idx)                          \
+    {                                                                  \
+        int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
+        g += val * (int8)direction##idx;                               \
+    }
+
+/* Calculates a 1x7 horizontal iteration. */
+#define SOBEL1x7(ptr, gx, gy)                        \
+    SOBEL1x1_HOR(ptr, gx, gy, 0)                     \
+    SOBEL1x1_HOR(ptr, gx, gy, 1)                 \
+    SOBEL1x1_HOR(ptr, gx, gy, 2)             \
+    SOBEL1x1_HOR(ptr, gx, gy, 3)         \
+    SOBEL1x1_HOR(ptr, gx, gy, 4)     \
+    SOBEL1x1_HOR(ptr, gx, gy, 5) \
+    SOBEL1x1_HOR(ptr, gx, gy, 6)
+
+/* Calculates a 7x1 vertical iteration. */
+#define SOBEL7x1(ptr, g, direction)                         \
+    SOBEL1x1_VERT(ptr, g, direction, 0)                     \
+    SOBEL1x1_VERT(ptr, g, direction, 1)                 \
+    SOBEL1x1_VERT(ptr, g, direction, 2)             \
+    SOBEL1x1_VERT(ptr, g, direction, 3)         \
+    SOBEL1x1_VERT(ptr, g, direction, 4)     \
+    SOBEL1x1_VERT(ptr, g, direction, 5) \
+    SOBEL1x1_VERT(ptr, g, direction, 6)
+
+/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S16 images and leave the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S32
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S32
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x7(
+    IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+    ,
+    IMAGE_DECLARATION(dst_gx)
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    ,
+    IMAGE_DECLARATION(dst_gy)
+#endif /* GRAD_Y */
+)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+    int8 gx = (int8)0;
+    int8 gy = (int8)0;
+
+    SOBEL1x7(&src, gx, gy);
+
+    // Store result in dst
+#ifdef GRAD_X
+    vstore8(gx, 0, ((__global int *)dst_gx.ptr));
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    vstore8(gy, 0, ((__global int *)dst_gy.ptr));
+#endif /* GRAD_Y */
+}
+
+/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_x_ptr                            Pointer to the source image. Supported data types: S32
+ * @param[in]  src_x_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_x_step_x                         src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_x_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_x_step_y                         src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_x_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  src_y_ptr                            Pointer to the source image. Supported data types: S32
+ * @param[in]  src_y_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_y_step_x                         src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_y_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_y_step_y                         src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_y_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  dummy                                Dummy parameter to easy conditional inclusion
+ */
+__kernel void sobel_separable7x1(
+#ifdef GRAD_X
+    IMAGE_DECLARATION(src_x),
+    IMAGE_DECLARATION(dst_gx),
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    IMAGE_DECLARATION(src_y),
+    IMAGE_DECLARATION(dst_gy),
+#endif /* GRAD_Y */
+    int dummy)
+{
+#ifdef GRAD_X
+    Image src_x  = CONVERT_TO_IMAGE_STRUCT(src_x);
+    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    Image src_y  = CONVERT_TO_IMAGE_STRUCT(src_y);
+    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif /* GRAD_Y */
+
+    // Output pixels
+#ifdef GRAD_X
+    int8 gx = 0;
+    SOBEL7x1(&src_x, gx, Y);
+    vstore8(gx, 0, (__global int *)dst_gx.ptr);
+#endif /* GRAD_X */
+#ifdef GRAD_Y
+    int8 gy = 0;
+    SOBEL7x1(&src_y, gy, X);
+    vstore8(gy, 0, (__global int *)dst_gy.ptr);
+#endif /* GRAD_Y */
+}
+
+/**********************************************/
+/*    End implementation of Sobel7x7 filter   */
+/**********************************************/
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/softmax_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/softmax_layer.clembed
new file mode 100644
index 0000000..8f2ecd1
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/softmax_layer.clembed

@@ -0,0 +1,1154 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define MAX_OP(x, y, type, size) max((x), (y))
+#define ADD_OP(x, y, type, size) ((x) + (y))
+#define SUB_OP(x, y, type, size) ((x) - (y))
+#define MUL_OP(x, y, type, size) ((x) * (y))
+#define DIV_OP(x, y, type, size) ((x) / (y))
+#define EXP_OP(x, type, size) exp((x))
+
+#ifdef USE_F16
+#define MINVAL -HALF_MAX
+#define SELECT_DATA_TYPE short
+#else /* USE_F16 */
+#define MINVAL -FLT_MAX
+#define SELECT_DATA_TYPE int
+#endif /* USE_F16 */
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+/* Vector size, i.e. number of vector elements. */
+#if VECTOR_SIZE == 2
+__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
+__constant uint2 idx__ = (uint2)(0, 1);
+
+#elif VECTOR_SIZE == 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
+
+#elif VECTOR_SIZE == 8
+__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
+__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+#else /* VECTOR_SIZE DEFAULT */
+#define VECTOR_SIZE 16
+#define LOG_VECTOR_SIZE 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+#endif /* VECTOR_SIZE END */
+
+// TODO (COMPMID-661): Remove if the non-fused kernels are removed
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant uint4 idx4   = (uint4)(0, 1, 2, 3);
+
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst))
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load max value of 1D logits vector (row)
+    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
+#ifdef LOG_SOFTMAX
+    vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+#else  /* LOG_SOFTMAX */
+    vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+#endif /* LOG_SOFTMAX */
+}
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element as sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_serial(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+#endif /* BETA */
+
+    // Initialize local maximum
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
+
+    // Calculate max of row
+    const uint width_ = width >> LOG_VECTOR_SIZE;
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data_max    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
+    widx        = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform max reduction
+#if VECTOR_SIZE == 16
+    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+    // Store result
+    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+
+    /* Second section */
+
+    // Load max value of 1D logits vector (row)
+    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    sum1D = 0;
+
+    // Shift values, exp and sum
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+#else  /* LOG_SOFTMAX */
+        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+#endif /* LOG_SOFTMAX */
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+#ifdef LOG_SOFTMAX
+    VSTORE(VECTOR_SIZE)
+    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    data = select(0, data, widx);
+#else  /* LOG_SOFTMAX */
+    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    data = select(0, data, widx);
+    VSTORE(VECTOR_SIZE)
+    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+#endif /* LOG_SOFTMAX */
+    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform sum reduction
+#if VECTOR_SIZE == 16
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+
+    // Calculate and store result
+    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element as sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    const uint lid = get_local_id(0);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+#endif /* BETA */
+
+    // Define one temporary vector per work-item.
+    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+    __local DATA_TYPE max_local;
+
+    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
+    // Number of elements per work-item.
+    const uint row = width / GRID_SIZE;
+    // Number of iterations per work-item.
+    const uint width_ = row >> 2;
+    // Calculate max of row
+    uint i = 0;
+    for(; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    // How many work-items needed to complete the computation.
+    //TODO: Optimize this calculation (avoid %).
+    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of 4
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx        = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = max_val_vec;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        max_val_vec     = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
+        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+        max_local       = max_val_vec.s0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Second section */
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    sum1D             = 0;
+    DATA_TYPE max_val = max_local;
+
+    // Shift values, exp and sum
+    for(i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        data = EXP_OP(data, DATA_TYPE, 4);
+#else  /* LOG_SOFTMAX */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+#endif /* LOG_SOFTMAX */
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    //TODO: Optimize the calculation (avoid %).
+    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        data = EXP_OP(data, DATA_TYPE, 4);
+#else  /* LOG_SOFTMAX */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+#endif /* LOG_SOFTMAX */
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        data = select(0, data, widx);
+#else  /* LOG_SOFTMAX */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        data = select(0, data, widx);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+#endif /* LOG_SOFTMAX */
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = sum1D;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
+        // Perform max reduction
+        sum1D.s01                        = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+        sum1D.s0                         = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+    }
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/softmax_layer_quantized.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/softmax_layer_quantized.clembed
new file mode 100644
index 0000000..9a6ce23
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/softmax_layer_quantized.clembed

@@ -0,0 +1,1639 @@
+R"(
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+#define MAX_OP(x, y, type, size) max((x), (y))
+#define ADD_OP(x, y, type, size) ((x) + (y))
+#define SUB_OP(x, y, type, size) ((x) - (y))
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#if VECTOR_SIZE == 2
+__constant uint2 idx__ = (uint2)(0, 1);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)
+
+#elif VECTOR_SIZE == 4
+__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)
+
+#elif VECTOR_SIZE == 8
+__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)
+
+#else /* VECTOR_SIZE DEFAULT */
+#define VECTOR_SIZE 16
+#define LOG_VECTOR_SIZE 4
+__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)
+
+#endif /* VECTOR_SIZE END */
+
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
+#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
+#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
+#if defined(DIFF_MIN)
+
+VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
+{
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    return data;
+}
+
+int4 mult_by_quantized_multiplier_parallel(int4 data)
+{
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    return data;
+}
+
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
+ * then gets the exponent of each element as sums all elements across each row.
+ *
+ * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in]  width                             Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
+
+    // Calculate max of row
+    const uint width4 = width >> LOG_VECTOR_SIZE;
+    for(uint i = 0; i < width4; i++)
+    {
+        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        max_val_vec   = MAX_OP(data, max_val_vec, DATA_TYPE, 16);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    // Handle non multiple of 16
+    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
+    VEC_BASE data        = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
+    VEC_UCHAR widx       = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
+    max_val_vec          = MAX_OP(max_val_vec, select(vec_min_val, data, widx), DATA_TYPE, 16);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform max reduction
+#if VECTOR_SIZE == 16
+    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+
+    // Store result
+    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+
+    // Second part
+
+    // Load max value of 1D logits vector (row)
+    int max_val = convert_int(*((__global DATA_TYPE *)offset(&maxo, 0, 0)));
+
+    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
+    VEC_INT sum1D = 0;
+
+    // Shift values, exp and sum
+    for(uint i = 0; i < width4; i++)
+    {
+        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        VEC_INT data_fp        = CONVERT(data, VEC_INT);
+        VEC_INT data_diff      = data_fp - max_val;
+        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
+        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+        VSTORE(VECTOR_SIZE)
+        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    // Handle non multiple of 16
+    data                   = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
+    VEC_INT data_fp        = CONVERT(data, VEC_INT);
+    VEC_INT data_diff      = data_fp - max_val;
+    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
+    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
+    VSTORE(VECTOR_SIZE)
+    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
+    data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    sum1D   = sum1D + select(MIN_VALUE, data_fp, widx_);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform sum reduction
+#if VECTOR_SIZE == 16
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+
+    // Calculate and store result
+    *((__global int *)sum.ptr) = sum1D.s0;
+}
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element as sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    const uint4 idx4 = (uint4)(0, 1, 2, 3);
+    const uint  lid  = get_local_id(0);
+
+    // Define one temporary vector per work-item.
+    __local int4 tmp_local[GRID_SIZE];
+    __local DATA_TYPE max_local;
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    vec_min_val = (VEC_DATA_TYPE(DATA_TYPE, 4))(MIN_VALUE);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_val_vec = vec_min_val;
+
+    // Number of elements per work-item.
+    const uint row = width / GRID_SIZE;
+    // Number of iterations per work-item.
+    const uint width_ = row >> 2;
+    // Calculate max of row
+    uint i = 0;
+    for(; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    // How many work-items needed to complete the computation.
+    //TODO: Optimize this calculation (avoid %).
+    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of 4
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max = vload4(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        widx        = CONVERT((((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width), VEC_DATA_TYPE(DATA_TYPE, 4));
+        max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data_max, widx), DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = convert_int4(max_val_vec);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        max_val_vec     = MAX_OP(CONVERT((tmp_local[lid + 1]), VEC_DATA_TYPE(DATA_TYPE, 4)), CONVERT((tmp_local[lid]), VEC_DATA_TYPE(DATA_TYPE, 4)), DATA_TYPE, 4);
+        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+        max_local       = max_val_vec.s0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Second section */
+
+    // Set sum vector
+    int4 sum1D   = 0;
+    int  max_val = convert_int(max_local);
+
+    // Shift values, exp and sum
+    for(i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        int4 data_fp        = convert_int4(data);
+        int4 data_diff      = data_fp - max_val;
+        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
+        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
+        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
+        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    //TODO: Optimize the calculation (avoid %).
+    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        int4 data_fp        = convert_int4(data);
+        int4 data_diff      = data_fp - max_val;
+        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
+        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
+        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
+        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
+        int4 data_fp        = convert_int4(data);
+        int4 data_diff      = data_fp - max_val;
+        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
+        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
+        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
+        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
+        data_fp             = select(MIN_VALUE, data_fp, widx);
+        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
+        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = sum1D;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
+        // Perform max reduction
+        sum1D.s01                  = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
+        sum1D.s0                   = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
+        *((__global int *)sum.ptr) = sum1D.s0;
+    }
+}
+
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst))
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load max value of 1D logits vector (row)
+    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
+
+    // It will be better to calculate this in prev layer and pass here as parameter
+#ifndef LOG_SOFTMAX
+    uint  sum_val_u               = convert_uint(sum_val);
+    int   headroom_plus_one       = clz(sum_val_u);
+    int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
+    int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
+    int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
+    int16 shifted_scale           = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
+#endif /* LOG_SOFTMAX */
+
+    // It was already calculated in prev layer, should be stored into tmp output and reused
+    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
+    int16 data_diff_mult = data_diff;
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+
+#ifdef LOG_SOFTMAX
+    long16 data = SUB_OP(convert_long16(data_diff_mult), (long16)(sum_val), long, 16);
+    data        = select(0L, data, convert_long16(data_diff) >= (long16)(DIFF_MIN));
+#else /* LOG_SOFTMAX */
+    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
+    data       = ASYMM_MULT(shifted_scale, data, 16);
+    data       = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
+#ifdef QASYMM8_SIGNED
+    data       = ADD_OP(data, (int16)(MIN_VALUE), int, 16);
+#endif /* QASYMM8_SIGNED */
+    data       = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN));
+#endif /* LOG_SOFTMAX */
+    vstore16(CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+}
+
+#endif /* defined(DIFF_MIN) */
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/space_to_batch.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/space_to_batch.clembed
new file mode 100644
index 0000000..ba99334
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/space_to_batch.clembed

@@ -0,0 +1,827 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software withoutput restriction, including withoutput limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KOUTD, EXPRESS OR
+ * IMPLIED, OUTCLUDOUTG BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONOUTFROUTGEMENT. OUT NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER OUT AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISOUTG FROM,
+ * OUT OF OR OUT CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALOUTGS OUT THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
+ * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
+ * @param[in]  paddings_stride_x                         Stride of the paddinds tensor in X dimension (in bytes)
+ * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  paddings_stride_y                         Stride of the paddinds tensor in Y dimension (in bytes)
+ * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  paddingse_offset_first_element_in_bytes   The offset of the first element in the second source image
+ * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                      Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                        block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor
+ * @param[in]  batch_id                                  The output tensor batch id
+ * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
+    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
+    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+    int block_x = *((__global int *)vector_offset(&block, 0));
+    int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z     = get_global_id(2);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
+ * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
+ * @param[in]  paddings_stride_x                         Stride of the paddinds tensor in X dimension (in bytes)
+ * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  paddings_stride_y                         Stride of the paddinds tensor in Y dimension (in bytes)
+ * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  paddingse_offset_first_element_in_bytes   The offset of the first element in the second source image
+ * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                      Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                        block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor
+ * @param[in]  batch_id                                  The output tensor batch id
+ * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
+    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
+    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+    int block_x = *((__global int *)vector_offset(&block, 0));
+    int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(1);
+    const int out_y = get_global_id(2);
+    const int z     = get_global_id(0);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of  y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in]  batch_id                             The output tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int block_x = BLOCK_SHAPE_X;
+    int block_y = BLOCK_SHAPE_Y;
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z     = get_global_id(2);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of  y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in]  batch_id                             The output tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int block_x = BLOCK_SHAPE_X;
+    int block_y = BLOCK_SHAPE_Y;
+
+    const int out_x = get_global_id(1);
+    const int out_y = get_global_id(2);
+    const int z     = get_global_id(0);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/space_to_depth.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/space_to_depth.clembed
new file mode 100644
index 0000000..941c867
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/space_to_depth.clembed

@@ -0,0 +1,654 @@
+R"(
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % r;
+
+    const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+    const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+}
+/** Space to depth transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nhwc(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0) % r;
+
+    const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+    const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/stack_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/stack_layer.clembed
new file mode 100644
index 0000000..10f17cf
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/stack_layer.clembed

@@ -0,0 +1,656 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
+
+#if AXIS == 0
+#define X_DST (idx_input)
+#define Y_DST (x_src)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 1 // AXIS == 1
+#define X_DST (x_src)
+#define Y_DST (idx_input)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 2 // AXIS == 2
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (idx_input)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 3 // AXIS == 3
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (idx_input)
+#define K_DST (w_src)
+#elif AXIS == 4 // AXIS == 4
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (w_src)
+#define K_DST (idx_input)
+#else // AXIS not supported
+#error "Not supported axis"
+#endif // AXIS == 0
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1
+ * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
+ * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_input                         Index of the input tensor in the list of tensors to stack
+ */
+__kernel void stack_layer(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    unsigned int idx_input)
+{
+    uint x_src = get_global_id(0);
+    uint y_src = get_global_id(1);
+    uint z_src = (get_global_id(2) % SRC_DIM2);
+    uint w_src = (get_global_id(2) / SRC_DIM2);
+
+    __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
+
+    __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
+                                                     dst_stride_w * (uint)DST_DIM3);
+
+    *dst = *src;
+}
+
+#undef X_DST
+#undef Y_DST
+#undef Z_DST
+#undef W_DST
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/tablelookup.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/tablelookup.clembed
new file mode 100644
index 0000000..dbfdfaf
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/tablelookup.clembed

@@ -0,0 +1,657 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function performs table lookup on U8 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  lut                               LUT table. Supported data types: U8
+ */
+__kernel void tablelookup_U8(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    __global uchar *lut)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load input data */
+    uchar8 data = vload8(0, src.ptr);
+
+    /* Load lut data */
+    uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3],
+                               lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]);
+
+    /* Store result */
+    vstore8(lut_data, 0, dst.ptr);
+}
+
+/** This function performs table lookup on S16 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: S16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  lut                               LUT table. Supported data types: S16
+ * @param[in]  offset                            LUT offset
+ * @param[in]  count                             Number of elements in the LUT
+ */
+__kernel void tablelookup_S16(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    __global short *lut,
+    uint            offset,
+    uint            count)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load input data */
+    short8 data = vload8(0, (__global short *)src.ptr);
+
+    /* Load output data */
+    int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
+
+    /* Calculate index */
+    int8 index = convert_int8(data) + (int8)(offset);
+    int8 cond  = (index >= 0 && index < (int8)count);
+    index      = select(0, index, cond);
+
+    /* Load lut data */
+    int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
+                           lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
+
+    /* Select output data depending on condition */
+    lut_data = select(out_data, lut_data, cond);
+
+    /* Store result */
+    vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/threshold.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/threshold.clembed
new file mode 100644
index 0000000..26d1ec9
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/threshold.clembed

@@ -0,0 +1,647 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Perform binary thresholding on an image.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[out] out_ptr                           Pointer to the destination image
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  false_val                         False value
+ * @param[in]  true_val                          True value
+ * @param[in]  threshold                         The thresold value
+ */
+__kernel void threshold_binary(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const uchar false_val,
+    const uchar true_val,
+    const uchar threshold)
+{
+    // Get pixels pointer
+    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    // Load data
+    uchar16 in_data = vload16(0, in.ptr);
+
+    // Perform binary thresholding
+    in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold);
+
+    // Store result
+    vstore16(in_data, 0, out.ptr);
+}
+
+/** Perform range thresholding on an image.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[out] out_ptr                           Pointer to the destination image
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  false_val                         False value
+ * @param[in]  true_val                          True value
+ * @param[in]  lower                             Lower threshold
+ * @param[in]  upper                             Upper threshold
+ */
+__kernel void threshold_range(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const uchar false_val,
+    const uchar true_val,
+    const uchar lower,
+    const uchar upper)
+{
+    // Get pixels pointer
+    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+    // Load data
+    uchar16 in_data = vload16(0, in.ptr);
+
+    // Perform range thresholding
+    in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < (uchar16)lower);
+
+    // Store result
+    vstore16(in_data, 0, out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/tile.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/tile.clembed
new file mode 100644
index 0000000..48eb660
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/tile.clembed

@@ -0,0 +1,640 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+/** Perform a floor operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void tile(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+
+    // For all coordinates but x, each tile copies from the input
+    const int y     = get_global_id(1);
+    const int z     = get_global_id(2) % DST_DEPTH;
+    const int batch = get_global_id(2) / DST_DEPTH;
+
+#if defined(VEC_SIZE) && defined(OFFSET)
+    // If we are loading/storing multiple elements at time, we need to
+    // not exceed the input boundaries. The last threads need to backtrack
+    // of OFFSET elements. Those elements cumulates for previous tiles
+    const int id = (int)(get_global_id(0));
+    int       x  = id * VEC_SIZE;
+
+    // Shift x based on the previous offsets
+    const int tile_number = x / SRC_WIDTH;
+    x -= (tile_number) * OFFSET;
+    int x_input = x % SRC_WIDTH;
+
+    // Shift x based on being the last tile
+    const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
+    x -= last_tile * OFFSET;
+    x_input = x % SRC_WIDTH;
+    output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
+
+    // Update the input pointer
+    input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+    // Copy the data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+#else  // !defined(VEC_SIZE) || !defined(OFFSET)
+    const int x = get_global_id(0);
+
+    // Update the input pointer
+    input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(OFFSET)
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/transpose.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/transpose.clembed
new file mode 100644
index 0000000..68dec6c
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/transpose.clembed

@@ -0,0 +1,763 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define SWAP_ROW(u0, l0)     \
+    ({                       \
+        tmp_swap = u0;       \
+        u0       = l0;       \
+        l0       = tmp_swap; \
+    })
+
+#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \
+    ({                                           \
+        VEC_DATA_TYPE(DATA_TYPE, 4)              \
+        tmp_swap;                                \
+        SWAP_ROW(u0, l0);                        \
+        SWAP_ROW(u1, l1);                        \
+        SWAP_ROW(u2, l2);                        \
+        SWAP_ROW(u3, l3);                        \
+    })
+
+#define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \
+    ({                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, 8)                                              \
+        tmp_swap;                                                                \
+        SWAP_ROW(u0, l0);                                                        \
+        SWAP_ROW(u1, l1);                                                        \
+        SWAP_ROW(u2, l2);                                                        \
+        SWAP_ROW(u3, l3);                                                        \
+        SWAP_ROW(u4, l4);                                                        \
+        SWAP_ROW(u5, l5);                                                        \
+        SWAP_ROW(u6, l6);                                                        \
+        SWAP_ROW(u7, l7);                                                        \
+    })
+
+#define TRANSPOSE_4x4(u0, u1, u2, u3) \
+    ({                                \
+        VEC_DATA_TYPE(DATA_TYPE, 4)   \
+        tmp;                          \
+        tmp.s012 = u0.s123;           \
+        u0.s1    = u1.s0;             \
+        u0.s2    = u2.s0;             \
+        u0.s3    = u3.s0;             \
+        u1.s0    = tmp.s0;            \
+        u2.s0    = tmp.s1;            \
+        u3.s0    = tmp.s2;            \
+        \
+        tmp.s01 = u1.s23;             \
+        u1.s2   = u2.s1;              \
+        u1.s3   = u3.s1;              \
+        u2.s1   = tmp.s0;             \
+        u3.s1   = tmp.s1;             \
+        \
+        tmp.s0 = u2.s3;               \
+        u2.s3  = u3.s2;               \
+        u3.s2  = tmp.s0;              \
+    })
+
+#define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7)                                             \
+    ({                                                                                            \
+        TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123);                                    \
+        TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567);                                    \
+        TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123);                                    \
+        TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567);                                    \
+        SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+    })
+
+#define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15)                                                \
+    ({                                                                                                                                       \
+        TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567);       \
+        TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF);       \
+        TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+        TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \
+        SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF,             \
+                 u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567);      \
+    })
+
+#ifndef DATA_TYPE_IN_BYTES
+#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
+#endif /* not DATA_TYPE_IN_BYTES */
+
+#undef VLOAD
+#undef VSTORE
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3)
+#define VLOAD(x, y) vload4(x, y)
+#define VSTORE(x, y, z) vstore4(x, y, z)
+#define BLOCK_SIZE 4
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7)
+#define VLOAD(x, y) vload8(x, y)
+#define VSTORE(x, y, z) vstore8(x, y, z)
+#define BLOCK_SIZE 8
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15)
+#define VLOAD(x, y) vload16(x, y)
+#define VSTORE(x, y, z) vstore16(x, y, z)
+#define BLOCK_SIZE 16
+#else /* switch DATA_TYPE_IN_BYTES */
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif /* switch DATA_TYPE_IN_BYTES */
+
+/** This OpenCL kernel computes the matrix transposition of input matrix
+ *
+ * @attention The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ *  -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ *  -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ *  -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: same as src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(IMAGE_DECLARATION(src),
+                        IMAGE_DECLARATION(dst))
+{
+    uint x = get_global_id(0) * BLOCK_SIZE;
+    uint y = get_global_id(1) * BLOCK_SIZE;
+
+    // Compute source address
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    // Load the NxN block at (x, y)
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+#if BLOCK_SIZE > 4
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 5)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7)));
+#if BLOCK_SIZE == 16
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14)));
+    VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+    u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15)));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+
+    // Transpose the block
+    TRANSPOSE();
+
+    // Store the block at (y, x)
+    uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes;
+    VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y));
+    VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y));
+    VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y));
+    VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y));
+#if BLOCK_SIZE > 4
+    VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y));
+    VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y));
+    VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y));
+    VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 7 * dst_stride_y));
+#if BLOCK_SIZE == 16
+    VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y));
+    VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y));
+    VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y));
+    VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y));
+    VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y));
+    VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y));
+    VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y));
+    VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/types.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/types.hembed
new file mode 100644
index 0000000..b0cfc72
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/types.hembed

@@ -0,0 +1,60 @@
+R"(
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+    int x; /**< The x coordinate. */
+    int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/* Keypoint struct */
+typedef struct Keypoint
+{
+    int   x;               /**< The x coordinate. */
+    int   y;               /**< The y coordinate. */
+    float strength;        /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+    float scale;           /**< Initialized to 0 by corner detectors. */
+    float orientation;     /**< Initialized to 0 by corner detectors. */
+    int   tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+    ushort x;         /**< Top-left x coordinate */
+    ushort y;         /**< Top-left y coordinate */
+    ushort width;     /**< Width of the detection window */
+    ushort height;    /**< Height of the detection window */
+    ushort idx_class; /**< Index of the class */
+    float  score;     /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/upsample_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/upsample_layer.clembed
new file mode 100644
index 0000000..77d8417
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/upsample_layer.clembed

@@ -0,0 +1,678 @@
+R"(
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
+
+/** This function applies upsample on an input image. (NHWC)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data = vload16(0, (__global DATA_TYPE *)src.ptr);
+
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
+#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/warp_affine.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_affine.clembed
new file mode 100644
index 0000000..3d9905b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_affine.clembed

@@ -0,0 +1,1341 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size(coords, width, height, 1);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const float16 t = (float16)(
+                          /* tl, tr, bl, br */
+                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
+
+/** Returns a vector of floats contaning the matrix coefficients. */
+inline const float8 build_affine_mtx()
+{
+    return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
+}
+
+/** Transforms 4 2D coordinates using the formula:
+ *
+ *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx   affine matrix
+ *
+ * @return a int8 containing 4 2D transformed values.
+ */
+inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    // transform [x,x+1,x+2,x+3]
+    const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
+    // transform [y,y+1,y+2,y+3]
+    const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
+ *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *   output(x,y) = input(x0,y0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in]  width                             Width of the destination image
+ * @param[in]  height                            Height of the destination image
+ */
+__kernel void warp_affine_nearest_neighbour(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const int width,
+    const int height)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in]  width                             Width of the destination image
+ * @param[in]  height                            Height of the destination image
+ */
+__kernel void warp_affine_bilinear(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const int width,
+    const int height)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/warp_helpers.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_helpers.hembed
new file mode 100644
index 0000000..3920305
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_helpers.hembed

@@ -0,0 +1,683 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size(coords, width, height, 1);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const float16 t = (float16)(
+                          /* tl, tr, bl, br */
+                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/warp_helpers_quantized.hembed b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_helpers_quantized.hembed
new file mode 100644
index 0000000..b7593ae
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_helpers_quantized.hembed

@@ -0,0 +1,1170 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    {                                                                                                                                   \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
+        const VEC_DATA_TYPE(int, size)                                                                                                  \
+        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        mask = (one << exponent) - one;                                                                                                 \
+        VEC_DATA_TYPE(int, size)                                                                                                        \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                                             \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                                             \
+    }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of a integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size_quantized(coords, width, height, 1);
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords_quantized(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords_quantized()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in            Pointer to the source image.
+ * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width         Width of the image
+ * @param[in] height        Height of the image
+ * @param[in] border_size   Border size
+ * @param[in] scale         Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
+                                                                                    const float scale, const int offset_qasymm)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const int16 t = (int16)(
+                        /* tl, tr, bl, br */
+                        * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                        *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                        *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                        *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                        *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                        *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                        *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                        *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+
+    const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
+
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
+                          ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
+                          ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
+                          ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
+
+    const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));
+
+    return res;
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in            Pointer to the source image.
+ * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width         Width of the image
+ * @param[in] height        Height of the image
+ * @param[in] scale         Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
+{
+    return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/warp_perspective.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_perspective.clembed
new file mode 100644
index 0000000..45ccd35
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/warp_perspective.clembed

@@ -0,0 +1,1349 @@
+R"(
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size(coords, width, height, 1);
+}
+
+/** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+{
+    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const float16 t = (float16)(
+                          /* tl, tr, bl, br */
+                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
+
+/** Returns the perspective matrix */
+inline const float16 build_perspective_mtx()
+{
+    return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
+}
+
+/** Transforms four 2D coordinates using the formula:
+ *
+ *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ *   (x0/z0,y0/z0)
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx   perspective matrix
+ *
+ * @return a vector float8 containing four 2D transformed values.
+ */
+inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    // transform [z,z+1,z+2,z+3]
+    const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
+    // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
+    // transform [x,x+1,x+2,x+3]
+    const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
+    // transform [y,y+1,y+2,y+3]
+    const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
+ *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ *   output(x,y) = input(x0/z0,y0/z0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in]  width                             Width of the destination image
+ * @param[in]  height                            Height of the destination image
+ */
+__kernel void warp_perspective_nearest_neighbour(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const int width,
+    const int height)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in]  width                             Width of the destination image
+ * @param[in]  height                            Height of the destination image
+ */
+__kernel void warp_perspective_bilinear(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const int width,
+    const int height)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr);
+}
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_filter_transform.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_filter_transform.clembed
new file mode 100644
index 0000000..d075dcf
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_filter_transform.clembed

@@ -0,0 +1,2495 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if defined(SRC_DIM_Z)
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp)                                                                                               \
+    ({                                                                                                                             \
+        out.s0 = -tmp.s0 / 36.f;                                                                                                   \
+        out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f;                                            \
+        out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f;                                            \
+        out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f;   \
+        out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f;   \
+        out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+        out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+        out.s7 = tmp.s6;                                                                                                           \
+    })
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0    = 0.0f;
+    out0.s0 = (w0.s0);
+    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+    out0.s3 = (w0.s2);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out1    = 0.0f;
+    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out2    = 0.0f;
+    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out3    = 0.0f;
+    out3.s0 = (w2.s0);
+    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+    out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z  = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 16 channels for 3x3 kernels
+    // 4 channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = (w0.s0) / 16.f;
+    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s5 = (w0.s2) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (w2.s0) / 4.f;
+    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z  = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 36 channels for 3x3 kernels
+    // 6 channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    // Row 0
+    DATA_TYPE out00, out01, out02, out03, out04, out05;
+    out00 = (w00) / 16.f;
+    out01 = (-w00 - w01 - w02) / 24.f;
+    out02 = (-w00 + w01 - w02) / 24.f;
+    out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
+    out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
+    out05 = (w02) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    DATA_TYPE out10, out11, out12, out13, out14, out15;
+    out10 = (-w00 - w10 - w20) / 24.f;
+    out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
+    out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
+    out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
+    out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
+    out15 = (-w02 - w12 - w22) / 6.f;
+
+    // Row 2
+    DATA_TYPE out20, out21, out22, out23, out24, out25;
+    out20 = (-w00 + w10 - w20) / 24.f;
+    out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
+    out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
+    out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
+    out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
+    out25 = (-w02 + w12 - w22) / 6.f;
+
+    // Row 3
+    DATA_TYPE out30, out31, out32, out33, out34, out35;
+    out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
+    out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
+    out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
+    out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
+    out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
+    out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
+
+    // Row 4
+    DATA_TYPE out40, out41, out42, out43, out44, out45;
+    out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
+    out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
+    out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
+    out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
+    out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
+    out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
+
+    // Row 5
+    DATA_TYPE out50, out51, out52, out53, out54, out55;
+    out50 = (w20) / 4.f;
+    out51 = (-w20 - w21 - w22) / 6.f;
+    out52 = (-w20 + w21 - w22) / 6.f;
+    out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
+    out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
+    out55 = (w22);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int x0 = get_global_id(2); // idx filter
+    int y0 = get_global_id(0); // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 36 channels for 3x3 kernels
+    // 6  channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out10;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out11;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out12;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out13;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w10           = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w20           = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w30           = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w40           = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Transform the input tile
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = w00.s0;
+    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+    out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+    out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+    // Row 6
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6    = 0.0f;
+    out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+    out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+    // Row 7
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7    = 0.0f;
+    out7.s0 = w40.s0;
+    out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+    out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+    out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+    out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+    out7.s7 = w41;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z  = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Load the values from the input tensor
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Load the values from the input tensor
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = w00;
+    out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
+    out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
+    out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
+    out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
+    out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
+    out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
+    out0.s7 = w04;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
+    out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+    out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+    out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+                (w04 + w14 + w24 + w34 + w44)) / 405.f;
+    out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+                (w04 + w14 + w24 + w34 + w44)) / 405.f;
+    out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
+                (w04 + w14 + w24 + w34 + w44)) / 810.f;
+    out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
+                (w04 + w14 + w24 + w34 + w44)) / 810.f;
+    out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
+    out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+    out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+    out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+                (w04 - w14 + w24 - w34 + w44)) / 405.f;
+    out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+                (w04 - w14 + w24 - w34 + w44)) / 405.f;
+    out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
+                (w04 - w14 + w24 - w34 + w44)) / 810.f;
+    out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
+                (w04 - w14 + w24 - w34 + w44)) / 810.f;
+    out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
+    out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
+                (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+    out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
+                (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+    out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
+               * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+    out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
+               * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+    out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+    out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+    out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
+    out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
+                (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+    out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
+                (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+    out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
+               * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+    out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
+               * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+    out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+    out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+    out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
+    out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
+                (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+    out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
+                (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+    out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
+               * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+    out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
+               * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+    out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+               (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+    out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+               (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+    out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
+
+    // Row 6
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6    = 0.0f;
+    out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
+    out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
+                (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+    out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
+                (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+    out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
+               * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+    out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
+               * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+    out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+               (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+    out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+               (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+    out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
+
+    // Row 7
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7    = 0.0f;
+    out7.s0 = w40;
+    out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
+    out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
+    out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
+    out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
+    out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
+    out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
+    out7.s7 = w44;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int x0 = get_global_id(2); // idx filter
+    int y0 = get_global_id(0); // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_7x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Load the values from the input tensor
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Load the values from the input tensor
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp = 0.0f;
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = 0.0f;
+
+    out0.s0 = -w00 / 36.0f;
+    out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
+    out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
+    out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
+    out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
+    out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
+    out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
+    out0.s7 = w06;
+
+    out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1 = 0.0f;
+
+    tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
+    tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
+    tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
+    tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
+    tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
+    tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
+    tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
+
+    OUTPUT_ROW_2x2_7x7(out1, tmp);
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2 = 0.0f;
+
+    tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
+    tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
+    tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
+    tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
+    tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
+    tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
+    tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
+
+    OUTPUT_ROW_2x2_7x7(out2, tmp);
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3 = 0.0f;
+
+    tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
+    tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
+    tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
+    tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
+    tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
+    tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
+    tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
+
+    OUTPUT_ROW_2x2_7x7(out3, tmp);
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4 = 0.0f;
+
+    tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
+    tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
+    tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
+    tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
+    tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
+    tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
+    tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
+
+    OUTPUT_ROW_2x2_7x7(out4, tmp);
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5 = 0.0f;
+
+    tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
+    tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
+    tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
+    tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
+    tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
+    tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
+    tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
+
+    OUTPUT_ROW_2x2_7x7(out5, tmp);
+
+    // Row 6
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6 = 0.0f;
+
+    tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
+    tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
+    tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
+    tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
+    tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
+    tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
+    tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
+
+    OUTPUT_ROW_2x2_7x7(out6, tmp);
+
+    // Row 7
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7 = 0.0f;
+
+    tmp.s0 = w60;
+    tmp.s1 = w61;
+    tmp.s2 = w62;
+    tmp.s3 = w63;
+    tmp.s4 = w64;
+    tmp.s5 = w65;
+    tmp.s6 = w66;
+
+    OUTPUT_ROW_2x2_7x7(out7, tmp);
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int x0 = get_global_id(2); // idx filter
+    int y0 = get_global_id(0); // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(SRC_DIM_Z)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_3x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_7x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_input_transform.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_input_transform.clembed
new file mode 100644
index 0000000..1d8de9b
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_input_transform.clembed

@@ -0,0 +1,3355 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact)                     \
+    ({                                                              \
+        comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6;            \
+        comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5;            \
+        comm_fact.s2 = 2.5f * tmp.s3;                               \
+        comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+        comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6;    \
+        comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4;        \
+        comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+        \
+        out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+        out.s1 = comm_fact.s0 + comm_fact.s1;                       \
+        out.s2 = comm_fact.s0 - comm_fact.s1;                       \
+        out.s3 = comm_fact.s3 + comm_fact.s4;                       \
+        out.s4 = comm_fact.s4 - comm_fact.s3;                       \
+        out.s5 = comm_fact.s5 + comm_fact.s6;                       \
+        out.s6 = comm_fact.s5 - comm_fact.s6;                       \
+        out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+    })
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact)                                                    \
+    ({                                                                                             \
+        comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6;                                   \
+        comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5;                            \
+        comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6;                                    \
+        comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5;                            \
+        comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6;                                     \
+        comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5;                            \
+        out.s0       = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6;                \
+        out.s1       = comm_fact.s0 - comm_fact.s1;                                                \
+        out.s2       = comm_fact.s0 + comm_fact.s1;                                                \
+        out.s3       = comm_fact.s2 - comm_fact.s3;                                                \
+        out.s4       = comm_fact.s2 + comm_fact.s3;                                                \
+        out.s5       = comm_fact.s4 - comm_fact.s5;                                                \
+        out.s6       = comm_fact.s4 + comm_fact.s5;                                                \
+        out.s7       = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+    })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z              = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp0 = in_row0;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    tmp0 -= in_row2;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+    DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+    DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+    DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp1 = in_row1 + in_row2;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp2 = in_row2 - in_row1;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp3 = in_row1 - in_row3;
+
+    DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+    DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+    DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+    DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
+
+    DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+    DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+    DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+    DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
+
+    DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+    DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+    DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+    DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z))  = out10;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z))  = out11;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z))  = out12;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z))  = out13;
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out20;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out21;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+    const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int       z        = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    src_addr += src_stride_z;
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp0 = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp4 = in_row4;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    tmp0 -= in_row2;
+    tmp4 -= in_row6;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp1 = in_row1 + in_row2;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp2 = in_row2 - in_row1;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp3 = in_row1 - in_row3;
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp5 = in_row5 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp6 = in_row6 - in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp7 = in_row5 - in_row7;
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+    vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+    vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+    vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+    vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+    vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+    vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+    vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+    vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+    vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+    vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+    vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+    vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+    vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+    vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+    vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int       z        = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row0
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
+#else  // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row0
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d01                                        = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE out0 = 0.0f;
+    DATA_TYPE out1 = 0.0f;
+    DATA_TYPE out2 = 0.0f;
+    DATA_TYPE out3 = 0.0f;
+    DATA_TYPE out4 = 0.0f;
+    DATA_TYPE out5 = 0.0f;
+
+    // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+    out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
+    out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
+    out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
+    out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
+    out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
+    out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row4
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+
+    // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
+    DATA_TYPE k0 = d41.s0;
+    DATA_TYPE k1 = d41.s0;
+    DATA_TYPE k2 = d41.s0;
+    DATA_TYPE k3 = d41.s0;
+    DATA_TYPE k4 = d41.s0;
+    DATA_TYPE k5 = 0.0f;
+
+    k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+    k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+    k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+    k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+    k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+    k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+    out0 += k0;
+    out1 += k1;
+    out2 += k2;
+    out3 += k3;
+    out4 += k4;
+    out5 += k5;
+
+    // Row2
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+
+    out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
+    out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
+    out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
+    out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
+    out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
+    out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Compute destination address
+#if defined(SRC_DEPTH)
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    __global DATA_TYPE *dst_addr               = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
+
+    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+
+    *(dst_addr) = out0;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out1;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out2;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out3;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out4;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out5;
+    dst_addr += dst_plane_stride;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    DATA_TYPE out6  = k0;
+    DATA_TYPE out7  = k1;
+    DATA_TYPE out8  = k2;
+    DATA_TYPE out9  = k3;
+    DATA_TYPE out10 = k4;
+    DATA_TYPE out11 = k5;
+    DATA_TYPE out12 = k0;
+    DATA_TYPE out13 = k1;
+    DATA_TYPE out14 = k2;
+    DATA_TYPE out15 = k3;
+    DATA_TYPE out16 = k4;
+    DATA_TYPE out17 = k5;
+    DATA_TYPE out18 = k0;
+    DATA_TYPE out19 = k1;
+    DATA_TYPE out20 = k2;
+    DATA_TYPE out21 = k3;
+    DATA_TYPE out22 = k4;
+    DATA_TYPE out23 = k5;
+    DATA_TYPE out24 = k0;
+    DATA_TYPE out25 = k1;
+    DATA_TYPE out26 = k2;
+    DATA_TYPE out27 = k3;
+    DATA_TYPE out28 = k4;
+    DATA_TYPE out29 = k5;
+
+    // Row1
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+
+    // Row3
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+
+    // Compute common parts for the channels between [6, 29]
+    // Channels [6, 11]:  [out10, out11, out12, out13, out14, out15]
+    // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+    DATA_TYPE part0  = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+    DATA_TYPE part1  = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+    DATA_TYPE part2  = 16.0f * d20.s2 - 4.0f * d21.s0;
+    DATA_TYPE part3  = 16.0f * d20.s1 - 4.0f * d20.s3;
+    DATA_TYPE part4  = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+    DATA_TYPE part5  = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+    DATA_TYPE part6  = 4.0f * d20.s2 - 4.0f * d21.s0;
+    DATA_TYPE part7  = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+    DATA_TYPE part8  = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+    DATA_TYPE part9  = 8.0f * d20.s1 - 8.0f * d20.s3;
+    DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+    DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+    // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+    // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+    DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+    DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+    DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+    DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+    DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+    DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+    DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
+    DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+    DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+    DATA_TYPE part21 = part9 * 0.25f;                                                 // 2.0f * (d20.s1 - d20.s3)
+    DATA_TYPE part22 = part10 * 0.25f;                                                // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+    DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+    out6 += part0 - part1;
+    out12 += part0 + part1;
+    out7 += part2 + part3 + part4 + part5;
+    out8 += part2 - part3 + part4 - part5;
+    out13 += part2 + part3 - part4 - part5;
+    out14 += part2 - part3 - part4 + part5;
+    out9 += part6 + part7 + part8 + part9;
+    out10 += part6 - part7 + part8 - part9;
+    out15 += part6 - part7 - part8 + part9;
+    out16 += part6 + part7 - part8 - part9;
+    out11 += part10 + part11;
+    out17 += part10 - part11;
+
+    out18 += part13 - part12;
+    out24 += part13 + part12;
+    out19 += part14 + part15 + part16 + part17;
+    out20 += part14 - part15 + part16 - part17;
+    out25 += part14 - part15 - part16 + part17;
+    out26 += part14 + part15 - part16 - part17;
+    out21 += part18 + part19 + part20 + part21;
+    out22 += part18 - part19 + part20 - part21;
+    out27 += part18 - part19 - part20 + part21;
+    out28 += part18 + part19 - part20 - part21;
+    out23 += part22 + part23;
+    out29 += part22 - part23;
+
+    *(dst_addr) = out6;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out7;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out8;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out9;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out10;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out11;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out12;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out13;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out14;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out15;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out16;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out17;
+    dst_addr += dst_plane_stride;
+
+    *(dst_addr) = out18;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out19;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out20;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out21;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out22;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out23;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out24;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out25;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out26;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out27;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out28;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out29;
+    dst_addr += dst_plane_stride;
+
+    // Row5
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+
+    // Channels [30, 35]
+    out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+    *(dst_addr) = out0;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out1;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out2;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out3;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out4;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out5;
+    dst_addr += dst_plane_stride;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int                                z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr                   = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+    // Load input tile
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Calculate common factors for intermediate tensor
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = 0.0f;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25 * in_row4;
+    tmp0 += -in_row6 + (DATA_TYPE)5.25 * in_row4 - (DATA_TYPE)5.25 * in_row2;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25 * in_row3;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact2 = (DATA_TYPE)0.25 * in_row2 - (DATA_TYPE)1.25 * in_row4 + in_row6;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+    comm_fact0 = (DATA_TYPE)2.5 * in_row3;
+    comm_fact1 = (DATA_TYPE)0.5 * in_row1 - comm_fact0 + (DATA_TYPE)2.0 * in_row5;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+    comm_fact1 = (DATA_TYPE)2.0 * in_row1 - comm_fact0 + (DATA_TYPE)0.5 * in_row5;
+    comm_fact2 = (DATA_TYPE)4.0 * in_row2 - (DATA_TYPE)5.0 * in_row4 + in_row6;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25 * in_row3 - (DATA_TYPE)5.25 * in_row5;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Calculate output rows (reuse comm_fact0 vector)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0;
+
+    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1, out2, out3, out4, out5, out6, out7;
+
+    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Store values across the channels
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+    const int z = get_global_id(2) % NUM_TILES_Y;
+    const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int z               = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
+
+#if defined(NUM_TILES_Y)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr  = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
+
+    // Clamp coordinates. This clamp is valid for all rows
+    int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
+    int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
+    y_coord0      = clamp(y_coord0, (int4) - 1, (int4)SRC_DIM_1);
+    y_coord1      = clamp(y_coord1, (int2) - 1, (int2)SRC_DIM_1);
+
+    int  z_coord;
+    int4 valid_y0;
+    int2 valid_y1;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row4
+    z_coord = (z * 4) - (int)PAD_TOP + 4;
+
+    // If z < 0, set y to -1
+    valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+    valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+    // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+
+    // Clamp z coordinate
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+
+    DATA_TYPE k0 = d44;
+    DATA_TYPE k1 = d44;
+    DATA_TYPE k2 = d44;
+    DATA_TYPE k3 = d44;
+    DATA_TYPE k4 = d44;
+    DATA_TYPE k5 = (DATA_TYPE)0.0f;
+
+    k0 += 4.0f * d40 - 5.0f * d42;
+    k1 += -4.0f * d41 - 4.0f * d42 + d43;
+    k2 += 4.0f * d41 - 4.0f * d42 - d43;
+    k3 += -2.0f * d41 + 2.0f * d43 - d42;
+    k4 += 2.0f * d41 - 2.0f * d43 - d42;
+    k5 += 4.0f * d41 - 5.0f * d43 + d45;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row0
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+
+#if PAD_TOP != 0
+    valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+    valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+    valid_y0 = select(valid_y0, (int)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+    z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+#else  // PAD_TOP != 0
+    valid_y0 = y_coord0;
+    valid_y1 = y_coord1;
+#endif // if PAD_TOP == 0, we cannot read out of bound
+
+    DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+#else  // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    int4            z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
+    int2            z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
+
+    valid_y0 = select((int4)y_coord0.s0, (int4) - 1, z_coords0 < (int4)0);
+    valid_y1 = select((int2)y_coord0.s0, (int2) - 1, z_coords1 < (int2)0);
+    valid_y0 = select(valid_y0, (int4)SRC_DIM_1, z_coords0 >= (int4)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int2)SRC_DIM_1, z_coords1 >= (int2)SRC_DIM_2);
+
+    z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
+    z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
+
+    DATA_TYPE d00                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
+    DATA_TYPE d01                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
+    DATA_TYPE d02                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
+    DATA_TYPE d03                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
+    DATA_TYPE d04                              = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
+    DATA_TYPE d05                              = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
+    DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
+    DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
+    DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
+    DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
+    DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row2
+    z_coord  = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+    valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+    valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+    valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+    z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+
+    out0 += k0;
+    out1 += k1;
+    out2 += k2;
+    out3 += k3;
+    out4 += k4;
+    out5 += k5;
+    DATA_TYPE out6  = k0;
+    DATA_TYPE out7  = k1;
+    DATA_TYPE out8  = k2;
+    DATA_TYPE out9  = k3;
+    DATA_TYPE out10 = k4;
+    DATA_TYPE out11 = k5;
+    DATA_TYPE out12 = k0;
+    DATA_TYPE out13 = k1;
+    DATA_TYPE out14 = k2;
+    DATA_TYPE out15 = k3;
+    DATA_TYPE out16 = k4;
+    DATA_TYPE out17 = k5;
+    DATA_TYPE out18 = k0;
+    DATA_TYPE out19 = k1;
+    DATA_TYPE out20 = k2;
+    DATA_TYPE out21 = k3;
+    DATA_TYPE out22 = k4;
+    DATA_TYPE out23 = k5;
+    DATA_TYPE out24 = k0;
+    DATA_TYPE out25 = k1;
+    DATA_TYPE out26 = k2;
+    DATA_TYPE out27 = k3;
+    DATA_TYPE out28 = k4;
+    DATA_TYPE out29 = k5;
+
+    // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+    out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
+    out1 += 20.0f * d21 + 20.0f * d22 - 5.0f * d23 - 5.0f * d24;
+    out2 += -20.0f * d21 + 20.0f * d22 + 5.0f * d23 - 5.0f * d24;
+    out3 += 10.0f * d21 + 5.0f * d22 - 10.0f * d23 - 5.0f * d24;
+    out4 += -10.0f * d21 + 5.0f * d22 + 10.0f * d23 - 5.0f * d24;
+    out5 += -20.0f * d21 + 25.0f * d23 - 5.0f * d25;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Compute destination address
+#if defined(NUM_TILES_Y)
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else  /* defined(NUM_TILES_Y) */
+    __global DATA_TYPE *dst_addr               = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(NUM_TILES_Y) */
+
+    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+
+    *((__global DATA_TYPE *)dst_addr) = out0;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out1;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out2;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out3;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out4;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out5;
+    dst_addr += dst_plane_stride;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    // Row1
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
+    // Row1 can never be out of bounds
+    valid_y0 = y_coord0;
+    valid_y1 = y_coord1;
+
+    DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row3
+    z_coord  = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+    valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+    valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+    valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+    z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+    z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Compute common parts for the channels between [6, 29]
+    // Channels [6, 11]:  [out10, out11, out12, out13, out14, out15]
+    // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+    DATA_TYPE part0  = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
+    DATA_TYPE part1  = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
+    DATA_TYPE part2  = 16.0f * d22 - 4.0f * d24;
+    DATA_TYPE part3  = 16.0f * d21 - 4.0f * d23;
+    DATA_TYPE part4  = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
+    DATA_TYPE part5  = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
+    DATA_TYPE part6  = 4.0f * d22 - 4.0f * d24;
+    DATA_TYPE part7  = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
+    DATA_TYPE part8  = 4.0f * d12 - 4.0f * d14 - d32 + d34;
+    DATA_TYPE part9  = 8.0f * d21 - 8.0f * d23;
+    DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
+    DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
+
+    // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+    // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+    DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
+    DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
+    DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d22 - d24
+    DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
+    DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
+    DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d21 - d23
+    DATA_TYPE part18 = part6 * 0.25f; // d22 - d24
+    DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
+    DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
+    DATA_TYPE part21 = part9 * 0.25f;                                        // 2.0f * (d21 - d23)
+    DATA_TYPE part22 = part10 * 0.25f;                                       // - 4.0f * d21 + 5.0f * d23 - d25
+    DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
+
+    out6 += part0 - part1;
+    out12 += part0 + part1;
+    out7 += part2 + part3 + part4 + part5;
+    out8 += part2 - part3 + part4 - part5;
+    out13 += part2 + part3 - part4 - part5;
+    out14 += part2 - part3 - part4 + part5;
+    out9 += part6 + part7 + part8 + part9;
+    out10 += part6 - part7 + part8 - part9;
+    out15 += part6 - part7 - part8 + part9;
+    out16 += part6 + part7 - part8 - part9;
+    out11 += part10 + part11;
+    out17 += part10 - part11;
+
+    out18 += part13 - part12;
+    out24 += part13 + part12;
+    out19 += part14 + part15 + part16 + part17;
+    out20 += part14 - part15 + part16 - part17;
+    out25 += part14 - part15 - part16 + part17;
+    out26 += part14 + part15 - part16 - part17;
+    out21 += part18 + part19 + part20 + part21;
+    out22 += part18 - part19 + part20 - part21;
+    out27 += part18 - part19 - part20 + part21;
+    out28 += part18 + part19 - part20 - part21;
+    out23 += part22 + part23;
+    out29 += part22 - part23;
+
+    *((__global DATA_TYPE *)dst_addr) = out6;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out7;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out8;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out9;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out10;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out11;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out12;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out13;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out14;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out15;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out16;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out17;
+    dst_addr += dst_plane_stride;
+
+    *((__global DATA_TYPE *)dst_addr) = out18;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out19;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out20;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out21;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out22;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out23;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out24;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out25;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out26;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out27;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out28;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out29;
+    dst_addr += dst_plane_stride;
+
+    // Row5
+    z_coord  = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+    valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+    valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+    valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+    z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+    z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Channels [30, 35]
+    out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;
+    out1 = -16.0f * d11 - 16.0f * d12 + 4.0f * d13 + 20.0f * d31 + 20.0f * d32 - 5.0f * d33 - 4.0f * d51 - 4.0f * d52 + d53 + d54 + 4.0f * d14 - 5.0f * d34;
+    out2 = 16.0f * d11 - 16.0f * d12 - 4.0f * d13 - 20.0f * d31 + 20.0f * d32 + 5.0f * d33 + 4.0f * d51 - 4.0f * d52 - d53 + d54 + 4.0f * d14 - 5.0f * d34;
+    out3 = -8.0f * d11 - 4.0f * d12 + 8.0f * d13 + 10.0f * d31 - 10.0f * d33 + 5.0f * d32 - 2.0f * d51 + 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
+    out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
+    out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;
+
+    *((__global DATA_TYPE *)dst_addr) = out0;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out1;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out2;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out3;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out4;
+    dst_addr += dst_plane_stride;
+    *((__global DATA_TYPE *)dst_addr) = out5;
+    dst_addr += dst_plane_stride;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+    const int z = get_global_id(2) % NUM_TILES_Y;
+    const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int                                z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
+
+    // Compute input address
+#if defined(NUM_TILES_Y)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr                   = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    // Clamp coordinates. This clamp is valid for all rows
+    int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+    y_coord      = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+    // Row0
+    // We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels
+    int z_coord = z * OUTPUT_TILE_H;
+
+    // Load the input tile
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Calculate common factors for intermediate tensor
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = 0.0f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = in_row0;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    // We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x5 kernels
+    int y_coord = y * (int)OUTPUT_TILE_W;
+
+    // Row0
+    // We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels
+    int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
+    int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0);         // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1);                 // Clamp z coordinate
+
+    // Load the input tile
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * src_stride_z);
+
+    // Calculate common factors for intermediate tensor
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = 0.0f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = in_row0;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+#else                                            // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
+
+    // Clamp coordinates. This clamp is valid for all rows
+    int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+    y_coord      = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+    // Row0
+    int  z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+    int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);                    // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);                             // Clamp z coordinate
+
+    // Load the input tile
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row1
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row2
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row3
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row4
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row5
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row6
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    // Row7
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
+
+    // Calculate intermediate tensor and reuse common factor vectors
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = in_row0 - in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+    comm_fact0 = (DATA_TYPE)2.5f * in_row3;
+    comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.f * in_row5;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+    comm_fact1 = (DATA_TYPE)2.f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
+    comm_fact2 = (DATA_TYPE)4.f * in_row2 - (DATA_TYPE)5.f * in_row4 + in_row6;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
+
+    // Calculate output rows (reuse comm_fact0 vector)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0, out1, out2, out3, out4, out5, out6, out7;
+    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Store values across the channels
+#if defined(NUM_TILES_Y)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* NUM_TILES_Y */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* NUM_TILES_Y */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+    const int z = get_global_id(2) % NUM_TILES_Y;
+    const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int       z        = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
+
+    // Compute input address
+#if defined(NUM_TILES_Y)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+    // Clamp coordinates. This clamp is valid for all rows
+    int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+    y_coord      = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+    // Clamp coordinates. This clamp is valid for all columns
+    int  z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+    int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);                    // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    // Load the input tile
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    // We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x5 kernels
+    int y_coord = y * (int)OUTPUT_TILE_W;
+
+    // Row0
+    // We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels
+    int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
+    int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0);         // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1);                 // Clamp z coordinate
+
+    // Load the input tile
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * (int)src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * (int)src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * (int)src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * (int)src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * (int)src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * (int)src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * (int)src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * (int)src_stride_z);
+
+    // Calculate common factors for intermediate tensor
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+#else                                            // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
+
+    // Clamp coordinates. This clamp is valid for all rows
+    int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+    y_coord      = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+    // Row0
+    int  z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+    int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);                    // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);                             // Clamp z coordinate
+
+    // Load the input tile
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row1
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row2
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row3
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row4
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row5
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row6
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row7
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;
+
+    // Calculate intermediate tensors
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+    OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);
+
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Store values across the channels
+#if defined(NUM_TILES_Y)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* NUM_TILES_Y */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* NUM_TILES_Y */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=7
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=7
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
+#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_output_transform.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_output_transform.clembed
new file mode 100644
index 0000000..0e6997a
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/winograd_output_transform.clembed

@@ -0,0 +1,3377 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute the 2x1 or 1x2 output tile
+    // out00 = d00 + d01 + d02
+    // out01 = d01 - d02 - d03
+
+    float out00 = d00 + d01 + d02;
+    float out01 = d01 - d02 - d03;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    // Compute the 2x2 output tile
+    float k0 = d01 + d11 + d21;
+    float k1 = d02 + d12 + d22;
+    float k2 = d11 - d21 - d31;
+    float k3 = d12 - d22 - d32;
+
+    // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+    // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+    // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+    // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+
+    float out00 = d10;
+    float out01 = -d13;
+    float out10 = d10;
+    float out11 = -d13;
+
+    out00 += d00 + d20 + k0 + k1;
+    out01 += k0 - k1 - (d03 + d23);
+    out10 += -d20 - d30 + k2 + k3;
+    out11 += k2 - k3 + d23 + d33;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+    int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+    // Get output address
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    const const VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+    // Add bias
+    out10 += (DATA_TYPE)b;
+    out11 += (DATA_TYPE)b;
+#endif // defined(HAS_BIAS)
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
+    ({                                                               \
+        col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6;                   \
+        col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
+    })
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    int y_in  = get_global_id(1);
+    int x_out = get_global_id(0);
+    int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the channels to compose the input tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00, out01, out02 and out03
+    float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
+    float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+    // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else                                                                         /* defined(SRC_DEPTH) */
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif                                                                        /* defined(SRC_DEPTH) */
+    offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt                                      = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Get output address
+    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+    // Compute the 8x2 intermediate tensor
+    VEC_DATA_TYPE(float, 2)
+    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
+
+    // Compute the 2x2 output tile
+    VEC_DATA_TYPE(float, 2)
+    out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
+    VEC_DATA_TYPE(float, 2)
+    out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out_col0 += (VEC_DATA_TYPE(float, 2))b;
+    out_col1 += (VEC_DATA_TYPE(float, 2))b;
+
+#endif // defined(HAS_BIAS)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+    offset      = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+    int2 mult_y = min((int2)dst_size - offset, (int2)1);                           // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+
+    // Store the output tile
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
+
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1)     = out_col0_dt.s1;
+    *(__global     DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src                                                                   = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr                                                       = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the channels to compose the 6x6 or 6x1 tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00, out01, out02 and out03
+    float out00 = d00 + d01 + d02 + d03 + d04;
+    float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
+    float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
+    float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+
+    // Compute out00, out01, out02 and out03
+    float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+    float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+    float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+    float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
+
+    float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+    float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+    out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+    out01 += k1 - d02 - d12 - d22 - d32 - d42;
+    out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+    out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+    // Compute out10, out11, out12 and out13
+    float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+    k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+    k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+    out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+    out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+    out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+    out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+    // Compute out20, out21, out22 and out23
+    float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+    k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+    k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+    out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+    out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+    out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+    out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+    // Compute out30, out31, out32 and out33
+    float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+    k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+    k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+    out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+    out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+    out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+    out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+    int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+    out02 += (float)b;
+    out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+    // Get output address
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+    // Add bias
+    out10 += (float)b;
+    out11 += (float)b;
+    out12 += (float)b;
+    out13 += (float)b;
+
+    out20 += (float)b;
+    out21 += (float)b;
+    out22 += (float)b;
+    out23 += (float)b;
+
+    out30 += (float)b;
+    out31 += (float)b;
+    out32 += (float)b;
+    out33 += (float)b;
+#endif // defined(HAS_BIAS)
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  dst_size                          Size of the destination tensor, minus the last padding
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00, out01, out02 and out03
+    float out00 = d00 + d01 + d02 + d03 + d04;
+    float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
+    float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
+    float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+
+    // Compute out00, out01, out02 and out03
+    float out00 = d01 + d21 + d41 + d11 + d31;
+    float out01 = d01 + d21 + d41 + d11 + d31;
+    float out02 = d01 + d21 + d41 + d11 + d31;
+    float out03 = d01 + d21 + d41 + d11 + d31;
+
+    float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+    float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+    out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+    out01 += k1 - d02 - d12 - d22 - d32 - d42;
+    out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+    out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+    // Compute out10, out11, out12 and out13
+    float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+    k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+    k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+    out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+    out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+    out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+    out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+    // Compute out20, out21, out22 and out23
+    float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+    k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+    k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+    out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+    out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+    out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+    out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+    // Compute out30, out31, out32 and out33
+    float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+    k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+    k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+    out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+    out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+    out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+    out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    int y_in  = get_global_id(1);
+    int x_out = get_global_id(0);
+    int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out00 += (DATA_TYPE)b;
+    out01 += (DATA_TYPE)b;
+    out02 += (DATA_TYPE)b;
+    out03 += (DATA_TYPE)b;
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    out10 += (DATA_TYPE)b;
+    out11 += (DATA_TYPE)b;
+    out12 += (DATA_TYPE)b;
+    out13 += (DATA_TYPE)b;
+
+    out20 += (DATA_TYPE)b;
+    out21 += (DATA_TYPE)b;
+    out22 += (DATA_TYPE)b;
+    out23 += (DATA_TYPE)b;
+
+    out30 += (DATA_TYPE)b;
+    out31 += (DATA_TYPE)b;
+    out32 += (DATA_TYPE)b;
+    out33 += (DATA_TYPE)b;
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#endif // defined(HAS_BIAS)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else                                                                               /* defined(SRC_DEPTH) */
+    int4       offset                                            = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif                                                                              /* defined(SRC_DEPTH) */
+    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+    // Store the 1x4 output tile
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
+#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+    // Store the 4x1 output tile
+    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+    int mult_y = min(dst_size - offset, 1);
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),
+                         A_VAL, B_VAL);
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+    int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+
+    // Store the 4x4 output tile
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),
+                                                              VEC_DATA_TYPE(DATA_TYPE, 4)),
+                         A_VAL, B_VAL);
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
+
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+}
+
+#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact)  \
+    ({                                                                   \
+        comm_fact.s0 = d1 + d2;                                          \
+        comm_fact.s1 = d3 + d4;                                          \
+        comm_fact.s2 = d5 + d6;                                          \
+        \
+        col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0;  \
+        col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
+        \
+        comm_fact.s0 = d1 - d2;                                          \
+        comm_fact.s1 = d3 - d4;                                          \
+        comm_fact.s2 = d5 - d6;                                          \
+        \
+        col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
+        col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7;  \
+    })
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute output address
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+    int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the channels to compose the input tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00, out01, out02 and out03
+    float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
+    float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
+    float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
+    float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+    out00 += (DATA_TYPE)b;
+    out01 += (DATA_TYPE)b;
+    out02 += (DATA_TYPE)b;
+    out03 += (DATA_TYPE)b;
+#endif // defined(HAS_BIAS)
+
+    // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+    // Compute the 8x4 intermediate tensor
+    VEC_DATA_TYPE(float, 4)
+    comm_fact0, comm_fact1, comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+    COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
+    // Compute the 4x4 output tile
+    comm_fact0 = tmp_col1 + tmp_col2;
+    comm_fact1 = tmp_col3 + tmp_col4;
+    comm_fact2 = tmp_col5 + tmp_col6;
+
+    VEC_DATA_TYPE(float, 4)
+    out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
+    VEC_DATA_TYPE(float, 4)
+    out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
+
+    comm_fact0 = tmp_col1 - tmp_col2;
+    comm_fact1 = tmp_col3 - tmp_col4;
+    comm_fact2 = tmp_col5 - tmp_col6;
+
+    VEC_DATA_TYPE(float, 4)
+    out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+    out_col0 += (VEC_DATA_TYPE(float, 4))b;
+    out_col1 += (VEC_DATA_TYPE(float, 4))b;
+    out_col2 += (VEC_DATA_TYPE(float, 4))b;
+    out_col3 += (VEC_DATA_TYPE(float, 4))b;
+#endif // defined(HAS_BIAS)
+
+    // Store the output tile
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+    vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
+            (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    int y_in  = get_global_id(1);
+    int x_out = get_global_id(0);
+    int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the channels to compose the input tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00, out01, out02 and out03
+    float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
+    float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
+    float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
+    float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+    out02 += (float)b;
+    out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+    // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else                                                                               /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif                                                                              /* defined(SRC_DEPTH) */
+    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Get output address
+    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
+                         B_VAL);
+    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+    // Compute the 8x4 intermediate tensor
+    VEC_DATA_TYPE(float, 4)
+    comm_fact0, comm_fact1, comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+    COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
+    // Compute the output tile
+    comm_fact0 = tmp_col1 + tmp_col2;
+    comm_fact1 = tmp_col3 + tmp_col4;
+    comm_fact2 = tmp_col5 + tmp_col6;
+
+    VEC_DATA_TYPE(float, 4)
+    out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
+    VEC_DATA_TYPE(float, 4)
+    out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+
+    comm_fact0 = tmp_col1 - tmp_col2;
+    comm_fact1 = tmp_col3 - tmp_col4;
+    comm_fact2 = tmp_col5 - tmp_col6;
+
+    VEC_DATA_TYPE(float, 4)
+    out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out_col0 += (VEC_DATA_TYPE(float, 4))b;
+    out_col1 += (VEC_DATA_TYPE(float, 4))b;
+    out_col2 += (VEC_DATA_TYPE(float, 4))b;
+    out_col3 += (VEC_DATA_TYPE(float, 4))b;
+#endif // defined(HAS_BIAS)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+    int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+
+    // Store the output tile
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
+
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_3x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    winograd_output_transform_2x2_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+                                           ,
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+                                          );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    winograd_output_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+                                           ,
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+                                          );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    winograd_output_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+                                           ,
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+                                          );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    winograd_output_transform_2x2_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+                                           ,
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+                                          );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    winograd_output_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+                                           ,
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+                                          );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    winograd_output_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+                                           ,
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+                                          );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/CL/cl_kernels/yolo_layer.clembed b/build/android-arm64v8a/src/core/CL/cl_kernels/yolo_layer.clembed
new file mode 100644
index 0000000..94c4c7a
--- /dev/null
+++ b/build/android-arm64v8a/src/core/CL/cl_kernels/yolo_layer.clembed

@@ -0,0 +1,787 @@
+R"(
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
+// without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name)     \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name)      \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name)   \
+    __global uchar *name##_ptr,      \
+    uint        name##_stride_x, \
+    uint        name##_step_x,   \
+    uint        name##_stride_y, \
+    uint        name##_step_y,   \
+    uint        name##_stride_z, \
+    uint        name##_step_z,   \
+    uint        name##_stride_w, \
+    uint        name##_step_w,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+    __global uchar *ptr;                           /**< Pointer to the starting postion of the buffer */
+    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+    };
+    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return tensor;
+}
+
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+                                             uint step_w,
+                                             uint mod_size)
+{
+    Tensor4D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z,
+        .stride_w                      = stride_w
+    };
+
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+    return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+    return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
+#endif // _HELPER_H
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Tangent Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) OP(op, DATA_TYPE, x, A_VAL, B_VAL)
+
+#if VEC_SIZE != 1
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+
+/** This performs a YOLO partial activation function for NCHW data layout
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACTIVATION_TYPE=name. e.g. -DACTIVATION_TYPE=TANH
+ * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void yolo_layer_nchw(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    const int  box_ch_id = get_global_id(2) % (NUM_CLASSES + 5);
+    const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+    if(activate)
+    {
+        // Load data
+        TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+        data      = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, data, A_VAL, B_VAL); // select(1.0f, ACTIVATION_OP(ACTIVATION_TYPE, data), (SELECT_TYPE)activate);
+
+        // Store result
+        VSTORE(VEC_SIZE)
+        (data, 0, (__global DATA_TYPE *)output.ptr);
+    }
+#ifndef IN_PLACE
+    else
+    {
+        // Load data
+        TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+        // Store result
+        VSTORE(VEC_SIZE)
+        (data, 0, (__global DATA_TYPE *)output.ptr);
+    }
+#endif // IN_PLACE
+}
+
+#else // VEC_SIZE != 1
+
+#define SELECT_TYPE SELECT_DATA_TYPE
+/** This performs a YOLO partial activation function for NCHW data layout
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=1
+ * @note Activation function should be given as a preprocessor argument using -DACTIVATION_TYPE=name. e.g. -DACTIVATION_TYPE=TANH
+ * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void yolo_layer_nhwc(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    const int  box_ch_id = get_global_id(0) % (NUM_CLASSES + 5);
+    const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+    if(activate)
+    {
+        // Load data
+        DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
+        data           = select(data, ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, data, A_VAL, B_VAL), (SELECT_TYPE)activate);
+
+        // Store result
+        *((__global DATA_TYPE *)output.ptr) = data;
+    }
+#ifndef IN_PLACE
+    else
+    {
+        // Load data
+        DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
+
+        // Store result
+        *((__global DATA_TYPE *)output.ptr) = data;
+    }
+#endif // IN_PLACE
+}
+
+#endif // VEC_SIZE != 1
+#endif // defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
+
+)"
\ No newline at end of file

diff --git a/build/android-arm64v8a/src/core/arm_compute_version.embed b/build/android-arm64v8a/src/core/arm_compute_version.embed
new file mode 100644
index 0000000..9bca003
--- /dev/null
+++ b/build/android-arm64v8a/src/core/arm_compute_version.embed

@@ -0,0 +1 @@
+"arm_compute_version=v20.02.1 Build options: {'os': 'android', 'build': 'embed_only', 'neon': '0', 'opencl': '1', 'embed_kernels': '1', 'validation_tests': '0', 'arch': 'arm64-v8a', 'build_dir': 'android-arm64v8a', 'benchmark_tests': '0'} Git hash=b'575c81f38edecaa662f3ee45d04ad8efded4fa81'"
\ No newline at end of file
commit	eb0170eb99a8d03384e87ff20a38f76ce16de252	[log] [tgz]
author	Kevin DuBois <kevindubois@google.com>	Thu Aug 06 09:13:53 2020 -0700
committer	Kevin DuBois <kevindubois@google.com>	Fri Aug 21 19:35:04 2020 -0700
tree	0d466fae5e5ab00e7bb66290fb909f3af9620196
parent	15cb0bc3d410187173f9f6179dca07725cfd1c08 [diff]