build/android-arm64v8a/src/core/CL/cl_kernels/hog.clembed - platform/external/ComputeLibrary - Git at Google

 R"(

 /*
  * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 /*
  * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #ifndef ARM_COMPUTE_HELPER_H
 #define ARM_COMPUTE_HELPER_H

 // Each OpenCL extension below is enabled only when BOTH the corresponding ARM_COMPUTE_* build
 // flag is defined AND the compiler advertises the extension through its own macro.

 // Half-precision floating point (cl_khr_fp16)
 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

 // 8-bit integer dot product (cl_arm_integer_dot_product_int8)
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

 // 8-bit integer dot product with accumulation (cl_arm_integer_dot_product_accumulate_int8)
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

 // printf support (cl_arm_printf), only requested for debug builds
 #if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
 #pragma OPENCL EXTENSION cl_arm_printf : enable
 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

 // Numeric tags for the GPU architecture families.
 // NOTE(review): presumably compared against a build-time architecture define; no use is visible in this part of the file.
 #define GPU_ARCH_MIDGARD 0x100
 #define GPU_ARCH_BIFROST 0x200

 /** Concatenate two inputs into a single token.
  *
  * @note The arguments are pasted as written: because they are operands of ##, they are not
  *       macro-expanded first.
  *
  * @param[in] a The first input to be concatenated
  * @param[in] b The second input to be concatenated
  *
  * @return The concatenated output
  */
 #define CONCAT(a, b) a##b

 /** Expand the given argument.
  *
  * Identity macro: it yields its argument unchanged, which forces the argument to be
  * macro-expanded by the preprocessor.
  *
  * @param[in] x The argument to be expanded
  *
  * @return The expanded output
  */
 #define EXPAND(x) x

 /** Clamp the given value between an upper and lower bound.
  *
  * @note Implemented with the min() and max() functions, so each argument is evaluated once.
  *
  * @param[in] x       The value to be clamped
  * @param[in] min_val The lower bound
  * @param[in] max_val The upper bound
  *
  * @return The clamped value.
  */
 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

 /** REVn reverses the given vector whose size is n.
  * @name REVn
  *
  * Each macro applies a component swizzle that lists the lanes from last to first.
  * REV1 is the identity because a single element has nothing to reverse.
  *
  * @param[in] x The vector to be reversed
  *
  * @return The reversed vector
  * @{
  */
 #define REV1(x) ((x))
 #define REV2(x) ((x).s10)
 #define REV3(x) ((x).s210)
 #define REV4(x) ((x).s3210)
 #define REV8(x) ((x).s76543210)
 #define REV16(x) ((x).sFEDCBA9876543210)
 /** @} */ // end of group REVn

 /** Reverse the given vector.
  * @name REVERSE
  *
  * REVERSE forwards to REVERSE_STR so that @p s is macro-expanded before it is pasted
  * onto the REV prefix; @p s must therefore expand to 1, 2, 3, 4, 8 or 16.
  *
  * @param[in] x The vector to be reversed
  * @param[in] s The size of the vector
  *
  * @return The reversed vector
  * @{
  */
 #define REVERSE_STR(x, s) REV##s((x))
 #define REVERSE(x, s) REVERSE_STR(x, s)
 /** @} */ // end of group REVERSE

 /** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
  * @name ROTs_n
  *
  * In ROTs_n, s is the vector size and n the rotation amount (0 <= n < s).
  * Lane i of the input ends up in lane (i + n) % s of the result; n = 0 is the identity.
  *
  * @param[in] x The vector to be shifted
  *
  * @return The shifted vector
  * @{
  */
 // Size 1
 #define ROT1_0(x) ((x))

 // Size 2
 #define ROT2_0(x) ((x))
 #define ROT2_1(x) ((x).s10)

 // Size 3
 #define ROT3_0(x) ((x))
 #define ROT3_1(x) ((x).s201)
 #define ROT3_2(x) ((x).s120)

 // Size 4
 #define ROT4_0(x) ((x))
 #define ROT4_1(x) ((x).s3012)
 #define ROT4_2(x) ((x).s2301)
 #define ROT4_3(x) ((x).s1230)

 // Size 8
 #define ROT8_0(x) ((x))
 #define ROT8_1(x) ((x).s70123456)
 #define ROT8_2(x) ((x).s67012345)
 #define ROT8_3(x) ((x).s56701234)
 #define ROT8_4(x) ((x).s45670123)
 #define ROT8_5(x) ((x).s34567012)
 #define ROT8_6(x) ((x).s23456701)
 #define ROT8_7(x) ((x).s12345670)

 // Size 16
 #define ROT16_0(x) ((x))
 #define ROT16_1(x) ((x).sF0123456789ABCDE)
 #define ROT16_2(x) ((x).sEF0123456789ABCD)
 #define ROT16_3(x) ((x).sDEF0123456789ABC)
 #define ROT16_4(x) ((x).sCDEF0123456789AB)
 #define ROT16_5(x) ((x).sBCDEF0123456789A)
 #define ROT16_6(x) ((x).sABCDEF0123456789)
 #define ROT16_7(x) ((x).s9ABCDEF012345678)
 #define ROT16_8(x) ((x).s89ABCDEF01234567)
 #define ROT16_9(x) ((x).s789ABCDEF0123456)
 #define ROT16_10(x) ((x).s6789ABCDEF012345)
 #define ROT16_11(x) ((x).s56789ABCDEF01234)
 #define ROT16_12(x) ((x).s456789ABCDEF0123)
 #define ROT16_13(x) ((x).s3456789ABCDEF012)
 #define ROT16_14(x) ((x).s23456789ABCDEF01)
 #define ROT16_15(x) ((x).s123456789ABCDEF0)
 /** @} */ // end of group ROTs_n

 /** Circular-right-shift (rotate-right) the given vector by the given amount.
  * @name ROTATE
  *
  * ROTATE forwards to ROTATE_STR so that @p s and @p n are macro-expanded before being
  * pasted into the ROTs_n name; the resulting ROTs_n macro must exist in the table above.
  *
  * @param[in] x The vector to be shifted
  * @param[in] s The size of the vector
  * @param[in] n The amount to be shifted
  *
  * @return The shifted vector
  * @{
  */
 #define ROTATE_STR(x, s, n) ROT##s##_##n(x)
 #define ROTATE(x, s, n) ROTATE_STR(x, s, n)
 /** @} */ // end of group ROTATE

 /** Creates a vector of size n filled with offset values corresponding to the location of each element.
  * @name V_OFFSn
  *
  * Lane i of the result holds the value i, i.e. (0, 1, ..., n - 1).
  *
  * @param[in] dt The data type of the output vector
  *
  * @return The vector filled with offset values
  * @{
  */
 #define V_OFFS1(dt) (dt)(0)
 #define V_OFFS2(dt) (dt)(0, 1)
 // Third lane is 2 (it used to be 3, which skipped an offset).
 #define V_OFFS3(dt) (dt)(0, 1, 2)
 #define V_OFFS4(dt) (dt)(0, 1, 2, 3)
 #define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
 #define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
 /** @} */ // end of group V_OFFSn

 /** Create a vector filled with offset values corresponding to the location of each element.
  * @name VEC_OFFS
  *
  * VEC_OFFS forwards to VEC_OFFS_STR so that @p s is macro-expanded before it is pasted
  * onto the V_OFFS prefix; @p s must therefore expand to 1, 2, 3, 4, 8 or 16.
  *
  * @param[in] dt The data type of the output vector
  * @param[in] s  The size of the output vector
  *
  * @return The vector filled with offset values
  * @{
  */
 #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
 #define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
 /** @} */ // end of group VEC_OFFS

 // VLOAD(size) / VSTORE(size) yield the name of the vloadN / vstoreN function for the given
 // size. The *_STR indirection lets `size` be macro-expanded before it is pasted.
 #define VLOAD_STR(size) vload##size
 #define VLOAD(size) VLOAD_STR(size)

 #define VSTORE_STR(size) vstore##size
 #define VSTORE(size) VSTORE_STR(size)

 // Size-1 "vector" type names map to the matching scalar type, so that a type built by
 // pasting a base type and the size 1 (e.g. through VEC_DATA_TYPE) names a valid type.
 #define float1 float
 #define half1 half
 #define char1 char
 #define uchar1 uchar
 #define short1 short
 #define ushort1 ushort
 #define int1 int
 #define uint1 uint
 #define long1 long
 #define ulong1 ulong
 #define double1 double

 // Scalar counterparts of vloadN / vstoreN for a size of 1: read or write the element at PTR + OFFSET.
 // Arguments and the whole expansion are parenthesized so that compound arguments
 // (e.g. a conditional expression as PTR) keep their intended grouping.
 #define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
 #define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

 // Convert built-in functions with _sat modifier are not supported in floating point so we create defines
 // without _sat to overcome this issue
 #define convert_float_sat convert_float
 #define convert_float1_sat convert_float
 #define convert_float2_sat convert_float2
 #define convert_float3_sat convert_float3
 #define convert_float4_sat convert_float4
 #define convert_float8_sat convert_float8
 #define convert_float16_sat convert_float16
 // Maps to convert_half (it used to map to convert_float, unlike every other half alias,
 // which made a scalar "saturated" half conversion return a float).
 #define convert_half_sat convert_half
 #define convert_half1_sat convert_half
 #define convert_half2_sat convert_half2
 #define convert_half3_sat convert_half3
 #define convert_half4_sat convert_half4
 #define convert_half8_sat convert_half8
 #define convert_half16_sat convert_half16

 // Size-1 conversion names map to the scalar conversion functions
 #define convert_float1 convert_float
 #define convert_half1 convert_half
 #define convert_char1 convert_char
 #define convert_uchar1 convert_uchar
 #define convert_short1 convert_short
 #define convert_ushort1 convert_ushort
 #define convert_int1 convert_int
 #define convert_uint1 convert_uint
 #define convert_long1 convert_long
 #define convert_ulong1 convert_ulong
 #define convert_double1 convert_double

 // Size-1 saturated conversion names map to the scalar saturated conversion functions
 #define convert_char1_sat convert_char_sat
 #define convert_uchar1_sat convert_uchar_sat
 #define convert_short1_sat convert_short_sat
 #define convert_ushort1_sat convert_ushort_sat
 #define convert_int1_sat convert_int_sat
 #define convert_uint1_sat convert_uint_sat
 #define convert_long1_sat convert_long_sat
 #define convert_ulong1_sat convert_ulong_sat
 #define convert_double1_sat convert_double_sat

 // In every pair below the *_STR macro does the token pasting and the public macro forwards
 // to it, so that macro arguments are expanded before they are pasted.

 // VEC_DATA_TYPE(type, size): name of the vector type, e.g. (float, 4) -> float4
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

 // CL_VEC_DATA_TYPE(type, size): same pasting as VEC_DATA_TYPE
 #define CL_VEC_DATA_TYPE_STR(type, size) type##size
 #define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)

 // CONVERT(x, type): calls convert_<type>(x)
 #define CONVERT_STR(x, type) (convert_##type((x)))
 #define CONVERT(x, type) CONVERT_STR(x, type)

 // CONVERT_SAT(x, type): calls convert_<type>_sat(x)
 #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
 #define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

 // CONVERT_SAT_ROUND(x, type, round): calls convert_<type>_sat_<round>(x)
 #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
 #define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

 // The *_DECLARATION(name) macros expand to the kernel parameter list describing one buffer:
 // the data pointer, a stride/step pair per dimension and the offset of the first element.
 // Every parameter is prefixed with `name`, which is what the CONVERT_TO_*_STRUCT(name)
 // macros rely on.

 // 1D buffer: X only
 #define VECTOR_DECLARATION(name)     \
     __global uchar *name##_ptr,      \
     uint        name##_stride_x, \
     uint        name##_step_x,   \
     uint        name##_offset_first_element_in_bytes

 // 2D buffer: X and Y
 #define IMAGE_DECLARATION(name)      \
     __global uchar *name##_ptr,      \
     uint        name##_stride_x, \
     uint        name##_step_x,   \
     uint        name##_stride_y, \
     uint        name##_step_y,   \
     uint        name##_offset_first_element_in_bytes

 // 3D buffer: X, Y and Z
 #define TENSOR3D_DECLARATION(name)   \
     __global uchar *name##_ptr,      \
     uint        name##_stride_x, \
     uint        name##_step_x,   \
     uint        name##_stride_y, \
     uint        name##_step_y,   \
     uint        name##_stride_z, \
     uint        name##_step_z,   \
     uint        name##_offset_first_element_in_bytes

 // 4D buffer: X, Y, Z and W
 #define TENSOR4D_DECLARATION(name)   \
     __global uchar *name##_ptr,      \
     uint        name##_stride_x, \
     uint        name##_step_x,   \
     uint        name##_stride_y, \
     uint        name##_step_y,   \
     uint        name##_stride_z, \
     uint        name##_step_z,   \
     uint        name##_stride_w, \
     uint        name##_step_w,   \
     uint        name##_offset_first_element_in_bytes

 // The CONVERT_*_STRUCT(name) macros forward the kernel parameters declared by the matching
 // *_DECLARATION(name) macro to the update_*_workitem_ptr() helper of the same shape.
 // The helpers multiply each step by the work-item's global id, so the *_NO_STEP variants,
 // which pass 0 for a step, leave the pointer un-advanced along that dimension.

 // Vector positioned at the current work-item
 #define CONVERT_TO_VECTOR_STRUCT(name) \
     update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

 // Vector positioned at the first element
 #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
     update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

 // Image positioned at the current work-item
 #define CONVERT_TO_IMAGE_STRUCT(name) \
     update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

 // Image positioned at the first element
 #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
     update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

 // Image view of a 3D tensor, positioned at the current work-item in X, Y and Z
 // (this macro used to be defined twice with identical text; one definition is kept)
 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
     update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

 // Image view of a 3D tensor: X and Y steps are 0 but the Z step is still applied,
 // so the pointer is advanced along Z only
 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
     update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

 // 3D tensor positioned at the current work-item
 #define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
     update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                  name##_stride_z, name##_step_z)

 // 3D tensor positioned at the first element
 #define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
     update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

 // 4D tensor positioned at the current work-item; mod_size splits the third global id into Z and W
 #define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
     update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                  name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

 // 4D tensor positioned at the first element
 #define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
     update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

 /** Structure to hold Vector information.
  *
  * @note update_vector_workitem_ptr() fills this structure and advances @ref ptr to the
  *       data of the calling work-item.
  */
 typedef struct Vector
 {
     __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
     int             offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
     int             stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
 } Vector;

 /** Structure to hold Image information.
  *
  * @note update_image_workitem_ptr() and update_image_from_tensor3D_workitem_ptr() fill this
  *       structure and advance @ref ptr to the data of the calling work-item.
  */
 typedef struct Image
 {
     __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
     int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
     int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
     int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
 } Image;

 /** Structure to hold 3D tensor information.
  *
  * @note update_tensor3D_workitem_ptr() fills this structure and advances @ref ptr to the
  *       data of the calling work-item.
  */
 typedef struct Tensor3D
 {
     __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
     int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
     int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
     int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
     int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
 } Tensor3D;

 /** Structure to hold 4D tensor information.
  *
  * @note update_tensor4D_workitem_ptr() fills this structure and advances @ref ptr to the
  *       data of the calling work-item.
  */
 typedef struct Tensor4D
 {
     __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
     int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
     int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
     int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
     int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
     int             stride_w;                      /**< Stride of the tensor in W dimension (in bytes) */
 } Tensor4D;

 /** Build a Vector descriptor whose pointer addresses the data of the calling work-item.
  *
  * @param[in] ptr                           Pointer to the starting position of the buffer
  * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
  * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
  * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
  *
  * @return A Vector object
  */
 inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
 {
     Vector result;
     result.ptr                           = ptr;
     result.offset_first_element_in_bytes = offset_first_element_in_bytes;
     result.stride_x                      = stride_x;

     // Byte distance from the start of the buffer to this work-item's first element.
     // The offset is read back from the (int) struct field, as the stored value is what counts.
     size_t byte_shift = result.offset_first_element_in_bytes;
     byte_shift += get_global_id(0) * step_x;

     result.ptr += byte_shift;
     return result;
 }

 /** Build an Image descriptor whose pointer addresses the data of the calling work-item.
  *
  * @param[in] ptr                           Pointer to the starting position of the buffer
  * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
  * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
  * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
  * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
  * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
  *
  * @return An Image object
  */
 inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
 {
     Image result;
     result.ptr                           = ptr;
     result.offset_first_element_in_bytes = offset_first_element_in_bytes;
     result.stride_x                      = stride_x;
     result.stride_y                      = stride_y;

     // Byte distance from the start of the buffer to this work-item's first element
     size_t byte_shift = result.offset_first_element_in_bytes;
     byte_shift += get_global_id(0) * step_x;
     byte_shift += get_global_id(1) * step_y;

     result.ptr += byte_shift;
     return result;
 }

 /** Build an Image descriptor from 3D tensor information, with the pointer addressing the data of the calling work-item.
  *
  * The Z stride is not stored in the returned Image; the Z step only contributes to the pointer position.
  *
  * @param[in] ptr                           Pointer to the starting position of the buffer
  * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
  * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
  * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
  * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
  * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
  * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
  *
  * @return An Image object
  */
 inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
 {
     Image result;
     result.ptr                           = ptr;
     result.offset_first_element_in_bytes = offset_first_element_in_bytes;
     result.stride_x                      = stride_x;
     result.stride_y                      = stride_y;

     // Byte distance from the start of the buffer to this work-item's first element
     size_t byte_shift = result.offset_first_element_in_bytes;
     byte_shift += get_global_id(0) * step_x;
     byte_shift += get_global_id(1) * step_y;
     byte_shift += get_global_id(2) * step_z;

     result.ptr += byte_shift;
     return result;
 }

 /** Build a Tensor3D descriptor whose pointer addresses the data of the calling work-item.
  *
  * @param[in] ptr                           Pointer to the starting position of the buffer
  * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
  * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
  * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
  * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
  * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
  * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
  *
  * @return A 3D tensor object
  */
 inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
 {
     Tensor3D result;
     result.ptr                           = ptr;
     result.offset_first_element_in_bytes = offset_first_element_in_bytes;
     result.stride_x                      = stride_x;
     result.stride_y                      = stride_y;
     result.stride_z                      = stride_z;

     // Byte distance from the start of the buffer to this work-item's first element
     size_t byte_shift = result.offset_first_element_in_bytes;
     byte_shift += get_global_id(0) * step_x;
     byte_shift += get_global_id(1) * step_y;
     byte_shift += get_global_id(2) * step_z;

     result.ptr += byte_shift;
     return result;
 }

 /** Build a Tensor4D descriptor whose pointer addresses the data of the calling work-item.
  *
  * The third global id carries both Z and W: Z is (id % mod_size) and W is (id / mod_size).
  *
  * @param[in] ptr                           Pointer to the starting position of the buffer
  * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
  * @param[in] step_x                        Step along X, multiplied by the first global id (in bytes)
  * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
  * @param[in] step_y                        Step along Y, multiplied by the second global id (in bytes)
  * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
  * @param[in] step_z                        Step along Z, multiplied by (third global id % mod_size) (in bytes)
  * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
  * @param[in] step_w                        Step along W, multiplied by (third global id / mod_size) (in bytes)
  * @param[in] mod_size                      Divisor used to split the third global id into Z and W; must be non-zero
  *
  * @return A 4D tensor object
  */
 inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                              uint step_w,
                                              uint mod_size)
 {
     Tensor4D result;
     result.ptr                           = ptr;
     result.offset_first_element_in_bytes = offset_first_element_in_bytes;
     result.stride_x                      = stride_x;
     result.stride_y                      = stride_y;
     result.stride_z                      = stride_z;
     result.stride_w                      = stride_w;

     // Z and W share the third dispatch dimension
     const size_t id_zw = get_global_id(2);

     // Byte distance from the start of the buffer to this work-item's first element
     size_t byte_shift = result.offset_first_element_in_bytes;
     byte_shift += get_global_id(0) * step_x;
     byte_shift += get_global_id(1) * step_y;
     byte_shift += (id_zw % mod_size) * step_z;
     byte_shift += (id_zw / mod_size) * step_w;

     result.ptr += byte_shift;
     return result;
 }

 /** Get the address of an element of a Vector, relative to the vector's current pointer.
  *
  * @param[in] vec Pointer to the Vector structure
  * @param[in] x   Relative X position
  *
  * @return Pointer to the element located x strides away from vec->ptr
  */
 inline __global const uchar *vector_offset(const Vector *vec, int x)
 {
     const int shift_x = x * vec->stride_x;
     return vec->ptr + shift_x;
 }

 /** Get the address of an element of an Image, relative to the image's current pointer.
  *
  * @param[in] img Pointer to the Image structure
  * @param[in] x   Relative X position
  * @param[in] y   Relative Y position
  *
  * @return Pointer to the element located x strides along X and y strides along Y from img->ptr
  */
 inline __global uchar *offset(const Image *img, int x, int y)
 {
     __global uchar *pos = img->ptr;
     pos += x * img->stride_x;
     pos += y * img->stride_y;
     return pos;
 }

 /** Get the pointer position of a Tensor3D
  *
  * @param[in] tensor Pointer to the starting position of the buffer
  * @param[in] x      Relative X position
  * @param[in] y      Relative Y position
  * @param[in] z      Relative Z position
  */
 inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
 {
     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
 }

 /** Get the pointer position of a Tensor4D
  *
  * @param[in] tensor Pointer to the starting position of the buffer
  * @param[in] x      Relative X position
  * @param[in] y      Relative Y position
  * @param[in] z      Relative Z position
  * @param[in] w      Relative W position
  */
 inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
 {
     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
 }

 #endif // ARM_COMPUTE_HELPER_H
 /*
  * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #ifndef ARM_COMPUTE_TYPES_H
 #define ARM_COMPUTE_TYPES_H

 /** 2D Coordinates structure: a pair of signed integer coordinates. */
 typedef struct Coordinates2D
 {
     int x; /**< The x coordinate. */
     int y; /**< The y coordinate. */
 } Coordinates2D;

 /** Keypoint structure: position of a keypoint plus its detection and tracking attributes. */
 typedef struct Keypoint
 {
     int   x;               /**< The x coordinate. */
     int   y;               /**< The y coordinate. */
     float strength;        /**< The strength of the keypoint. Its definition is specific to the corner detector. */
     float scale;           /**< Initialized to 0 by corner detectors. */
     float orientation;     /**< Initialized to 0 by corner detectors. */
     int   tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
     float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
 } Keypoint;

 /** Detection window structure: a rectangle with the class it was detected for and the detection score. */
 typedef struct DetectionWindow
 {
     ushort x;         /**< Top-left x coordinate */
     ushort y;         /**< Top-left y coordinate */
     ushort width;     /**< Width of the detection window */
     ushort height;    /**< Height of the detection window */
     ushort idx_class; /**< Index of the class */
     float  score;     /**< Confidence value for the detection window */
 } DetectionWindow;
 #endif // ARM_COMPUTE_TYPES_H

 #if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)

 /** This OpenCL kernel computes the HOG orientation binning
  *
  * @attention The following variables must be passed at compile time:
  *
  * -# -DCELL_WIDTH = Width of the cell
  * -# -DCELL_HEIGHT = height of the cell
  * -# -DNUM_BINS = Number of bins for each cell
  * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
  *
  * @note Each work-item computes a single cell
  *
  * @param[in]  mag_ptr                             Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
  * @param[in]  mag_stride_x                        Stride of the magnitude image in X dimension (in bytes)
  * @param[in]  mag_step_x                          mag_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  mag_stride_y                        Stride of the magnitude image in Y dimension (in bytes)
  * @param[in]  mag_step_y                          mag_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  mag_offset_first_element_in_bytes   The offset of the first element in the magnitude image
  * @param[in]  phase_ptr                           Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
  * @param[in]  phase_stride_x                      Stride of the phase image in X dimension (in bytes)
  * @param[in]  phase_step_x                        phase_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  phase_stride_y                      Stride of the phase image in Y dimension (in bytes)
  * @param[in]  phase_step_y                        phase_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  phase_offset_first_element_in_bytes The offset of the first element in the phase image
  * @param[out] dst_ptr                             Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
  * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
  * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
  * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
  */
 __kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
                                       IMAGE_DECLARATION(phase),
                                       IMAGE_DECLARATION(dst))
 {
     // Histogram of the cell handled by this work-item, accumulated locally and written out once at the end
     float bins[NUM_BINS] = { 0 };

     // Compute address for the magnitude and phase images
     // (the macro reads the mag_* / phase_* kernel arguments and returns a struct whose ptr
     //  already points at this work-item's data)
     Image mag   = CONVERT_TO_IMAGE_STRUCT(mag);
     Image phase = CONVERT_TO_IMAGE_STRUCT(phase);

     __global uchar *mag_row_ptr   = mag.ptr;
     __global uchar *phase_row_ptr = phase.ptr;

     for(int yc = 0; yc < CELL_HEIGHT; ++yc)
     {
         // Process 4 pixels at a time while at least 4 remain in the row
         int xc = 0;
         for(; xc <= (CELL_WIDTH - 4); xc += 4)
         {
             // Load magnitude and phase values
             // (magnitude is read as 16-bit signed values, phase as 8-bit unsigned values)
             const float4 mag_f32   = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
             float4       phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));

             // Scale phase: phase * scale + 0.5f
             phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;

             // Compute histogram index.
             // (the float-to-int conversion drops the fractional part)
             int4 hidx_s32 = convert_int4(phase_f32);

             // Compute magnitude weights (w0 and w1)
             const float4 hidx_f32 = convert_float4(hidx_s32);

             // w1 = phase_f32 - hidx_s32
             // (fractional part of the scaled phase: the share of the vote given to the next bin)
             const float4 w1_f32 = phase_f32 - hidx_f32;

             // w0 = 1.0 - w1
             const float4 w0_f32 = (float4)1.0f - w1_f32;

             // Calculate the weights for splitting vote
             const float4 mag_w0_f32 = mag_f32 * w0_f32;
             const float4 mag_w1_f32 = mag_f32 * w1_f32;

             // Weighted vote between 2 bins

             // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
             // NOTE(review): only an index exactly equal to NUM_BINS is wrapped here, whereas the
             // left-over loop below wraps with %. This assumes the scaled phase never exceeds
             // NUM_BINS - confirm against the PHASE_SCALE passed by the host.
             hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));

             // Bin 0
             // (lanes are accumulated one by one because several lanes may hit the same bin)
             bins[hidx_s32.s0] += mag_w0_f32.s0;
             bins[hidx_s32.s1] += mag_w0_f32.s1;
             bins[hidx_s32.s2] += mag_w0_f32.s2;
             bins[hidx_s32.s3] += mag_w0_f32.s3;

             // Move to the neighbouring bin, which receives the w1 share
             hidx_s32 += (int4)1;

             // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
             hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));

             // Bin1
             bins[hidx_s32.s0] += mag_w1_f32.s0;
             bins[hidx_s32.s1] += mag_w1_f32.s1;
             bins[hidx_s32.s2] += mag_w1_f32.s2;
             bins[hidx_s32.s3] += mag_w1_f32.s3;
         }

         // Left over computation
         // (scalar version of the loop above for the last CELL_WIDTH % 4 pixels of the row)
         for(; xc < CELL_WIDTH; xc++)
         {
             const float mag_value   = *((__global short *)mag_row_ptr + xc);
             const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
             const float w1          = phase_value - floor(phase_value);

             // The quantised phase is the histogram index [0, NUM_BINS - 1]
             // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
             const uint hidx = (uint)(phase_value) % NUM_BINS;

             // Weighted vote between 2 bins
             bins[hidx] += mag_value * (1.0f - w1);
             bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
         }

         // Point to the next row of magnitude and phase images
         mag_row_ptr += mag_stride_y;
         phase_row_ptr += phase_stride_y;
     }

     // Compute address for the destination image
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

     // Store the local HOG in the global memory
     // (4 bins per store while at least 4 remain, written as consecutive floats from dst.ptr)
     int xc = 0;
     for(; xc <= (NUM_BINS - 4); xc += 4)
     {
         float4 values = vload4(0, bins + xc);

         vstore4(values, 0, ((__global float *)dst.ptr) + xc);
     }

     // Left over stores
     // (remaining NUM_BINS % 4 bins, one float at a time)
     for(; xc < NUM_BINS; ++xc)
     {
         ((__global float *)dst.ptr)[xc] = bins[xc];
     }
 }
 #endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */

 #if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)

 #ifndef L2_NORM
 #error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
 #endif /* not L2_NORM */

 #ifndef L2HYS_NORM
 #error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
 #endif /* not L2HYS_NORM */

 #ifndef L1_NORM
 #error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
 #endif /* not L1_NORM */

 /** Normalizes a single HOG block.
  *
  * The kernel runs in up to three passes over the block:
  * -# Gather: NUM_BINS_PER_BLOCK_X floats are read from each of the NUM_CELLS_PER_BLOCK_HEIGHT source rows
  *    and written back-to-back at the destination address, while the norm (sum of squares for L2_NORM and
  *    L2HYS_NORM, sum of absolute values otherwise) is accumulated.
  * -# L2HYS_NORM only: the gathered values are scaled, clipped to L2_HYST_THRESHOLD and a new sum of squares
  *    is accumulated from the clipped values.
  * -# Scale: every one of the NUM_BINS_PER_BLOCK destination values is multiplied by the final scale factor.
  *
  * @attention The following variables must be passed at compile time:
  *
  * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block along the Y direction (number of source rows gathered)
  * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
  * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
  * -# -DHOG_NORM_TYPE = Normalization type
  * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
  * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
  * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
  * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
  *
  * @note Each work-item computes a single block
  * @note The first scale factor is 1 / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f) for every HOG_NORM_TYPE, i.e. the
  *       square root is also applied to the sum of absolute values accumulated for L1_NORM.
  *
  * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
  * @param[out] dst_ptr                           Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 __kernel void hog_block_normalization(IMAGE_DECLARATION(src),
                                       IMAGE_DECLARATION(dst))
 {
     // Resolve the addresses this work-item reads from and writes to
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

     // The whole block is written as one contiguous run of floats starting here
     __global float *block_out = (__global float *)dst.ptr;

     // Vector and scalar parts of the running norm. They are merged once a pass is complete
     float4 acc_f32 = (float4)(0.0f);
     float  acc     = 0.0f;

     // Pass 1: gather the block row by row and accumulate the norm
     for(size_t row = 0; row < NUM_CELLS_PER_BLOCK_HEIGHT; ++row)
     {
         const __global float *row_in  = (__global float *)(src.ptr + row * src_stride_y);
         __global float       *row_out = block_out + row * NUM_BINS_PER_BLOCK_X;

         // 16 bins per iteration
         int col = 0;
         while(col <= (NUM_BINS_PER_BLOCK_X - 16))
         {
             // vload4(n, p) reads the four floats starting at p + 4 * n
             const float4 in0 = vload4(0, row_in + col);
             const float4 in1 = vload4(1, row_in + col);
             const float4 in2 = vload4(2, row_in + col);
             const float4 in3 = vload4(3, row_in + col);

 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
             // L2_NORM and L2HYS_NORM accumulate val^2
             acc_f32 += in0 * in0;
             acc_f32 += in1 * in1;
             acc_f32 += in2 * in2;
             acc_f32 += in3 * in3;
 #else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
             // Any other normalization type accumulates |val|
             acc_f32 += fabs(in0);
             acc_f32 += fabs(in1);
             acc_f32 += fabs(in2);
             acc_f32 += fabs(in3);
 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */

             // Keep the raw values packed linearly in the output so that the following passes
             // walk consecutive memory instead of striding through the source rows again
             vstore4(in0, 0, row_out + col);
             vstore4(in1, 1, row_out + col);
             vstore4(in2, 2, row_out + col);
             vstore4(in3, 3, row_out + col);

             col += 16;
         }

         // Remaining bins of this row, one at a time
         while(col < NUM_BINS_PER_BLOCK_X)
         {
             const float elem = row_in[col];

 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
             acc += elem * elem;
 #else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
             acc += fabs(elem);
 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */

             row_out[col] = elem;

             ++col;
         }
     }

     // Fold the four vector lanes into the scalar accumulator
     acc += dot(acc_f32, (float4)1.0f);

     float scale = 1.0f / (sqrt(acc) + NUM_BINS_PER_BLOCK * 0.1f);

 #if(HOG_NORM_TYPE == L2HYS_NORM)
     // Pass 2 (L2HYS_NORM only): scale, clip and accumulate the norm of the clipped values
     acc_f32 = (float4)0.0f;
     acc     = 0.0f;

     int k = 0;
     while(k <= NUM_BINS_PER_BLOCK - 16)
     {
         __global float *chunk = block_out + k;

         // Scale each value, then clip it if it is over L2_HYST_THRESHOLD
         const float4 clip0 = fmin(vload4(0, chunk) * (float4)scale, (float4)L2_HYST_THRESHOLD);
         const float4 clip1 = fmin(vload4(1, chunk) * (float4)scale, (float4)L2_HYST_THRESHOLD);
         const float4 clip2 = fmin(vload4(2, chunk) * (float4)scale, (float4)L2_HYST_THRESHOLD);
         const float4 clip3 = fmin(vload4(3, chunk) * (float4)scale, (float4)L2_HYST_THRESHOLD);

         // Accumulate val^2 of the clipped values
         acc_f32 += clip0 * clip0;
         acc_f32 += clip1 * clip1;
         acc_f32 += clip2 * clip2;
         acc_f32 += clip3 * clip3;

         vstore4(clip0, 0, chunk);
         vstore4(clip1, 1, chunk);
         vstore4(clip2, 2, chunk);
         vstore4(clip3, 3, chunk);

         k += 16;
     }

     // Remaining bins, one at a time
     while(k < NUM_BINS_PER_BLOCK)
     {
         const float clipped = fmin(block_out[k] * scale, (float)L2_HYST_THRESHOLD);

         acc += clipped * clipped;

         block_out[k] = clipped;

         ++k;
     }

     acc += dot(acc_f32, (float4)1.0f);

     // We use the same constants of OpenCV
     scale = 1.0f / (sqrt(acc) + 1e-3f);

 #endif /* (HOG_NORM_TYPE == L2HYS_NORM) */

     // Final pass: multiply every value of the block by the normalization scale factor
     int i = 0;
     while(i <= (NUM_BINS_PER_BLOCK - 16))
     {
         __global float *chunk = block_out + i;

         const float4 out0 = vload4(0, chunk) * (float4)scale;
         const float4 out1 = vload4(1, chunk) * (float4)scale;
         const float4 out2 = vload4(2, chunk) * (float4)scale;
         const float4 out3 = vload4(3, chunk) * (float4)scale;

         vstore4(out0, 0, chunk);
         vstore4(out1, 1, chunk);
         vstore4(out2, 2, chunk);
         vstore4(out3, 3, chunk);

         i += 16;
     }

     while(i < NUM_BINS_PER_BLOCK)
     {
         block_out[i] *= scale;

         ++i;
     }
 }
 #endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */

 #if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)

 /** Scores one detection window with a linear SVM and records it when the score exceeds THRESHOLD.
  *
  * The score is the dot product between NUM_BLOCKS_PER_DESCRIPTOR_Y rows of NUM_BINS_PER_DESCRIPTOR_X source values
  * and the matching coefficients of @p hog_descriptor, plus the bias stored right after the coefficients.
  *
  * @attention The following variables must be passed at compile time:
  *
  * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
  * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
  * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
  * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
  * -# -DIDX_CLASS = Index of the class to detect
  * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
  * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
  * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
  * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
  *
  * @note Each work-item computes a single detection window
  * @note The counter is incremented before the slot index is checked against MAX_NUM_DETECTION_WINDOWS, so the value left in
  *       @p num_detection_windows can be larger than the number of entries actually written to @p dst.
  *
  * @param[in]     src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
  * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
  * @param[in]     hog_descriptor                    Pointer to HOG descriptor. Supported data types: F32
  * @param[out]    dst                               Pointer to DetectionWindow array
  * @param[in,out] num_detection_windows             Number of objects detected. Read for the early exit and atomically incremented for every detection
  */
 __kernel void hog_detector(IMAGE_DECLARATION(src),
                            __global float *hog_descriptor,
                            __global DetectionWindow *dst,
                            __global uint *num_detection_windows)
 {
     // Nothing left to do once the DetectionWindow array is full
     if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
     {
         return;
     }

     Image src = CONVERT_TO_IMAGE_STRUCT(src);

     // Distance between two consecutive source rows, expressed in float elements
     const int row_pitch_f32 = src_stride_y / sizeof(float);

     // Vector and scalar parts of the SVM score. They are merged after the loops
     float4 partial_f32 = (float4)0.0f;
     float  partial     = 0.0f;

     __global float *feature_row = (__global float *)src.ptr;

     // Compute Linear SVM
     for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb)
     {
         // Coefficients that pair with the current source row
         __global float *weight_row = hog_descriptor + yb * NUM_BINS_PER_DESCRIPTOR_X;

         // 8 bins per iteration
         int xb = 0;
         while(xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8)
         {
             // vload4(n, p) reads the four floats starting at p + 4 * n
             const float4 feat_lo = vload4(0, feature_row + xb);
             const float4 feat_hi = vload4(1, feature_row + xb);

             const float4 coef_lo = vload4(0, weight_row + xb);
             const float4 coef_hi = vload4(1, weight_row + xb);

             // Multiply accumulate
             partial_f32 += feat_lo * coef_lo;
             partial_f32 += feat_hi * coef_hi;

             xb += 8;
         }

         // Remaining bins of this row, one at a time
         while(xb < NUM_BINS_PER_DESCRIPTOR_X)
         {
             const float feat = feature_row[xb];
             const float coef = weight_row[xb];

             partial += feat * coef;

             ++xb;
         }

         feature_row += row_pitch_f32;
     }

     // Fold the four vector lanes into the scalar part
     float score = partial;
     score += dot(partial_f32, (float4)1.0f);

     // Add the bias. It is stored right after the last coefficient, i.e. at index
     // NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y = (descriptor_size - 1)
     score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];

     // Only scores strictly above the threshold count as a detection
     // (written as a negated '>' so that a NaN score is rejected as well)
     if(!(score > (float)THRESHOLD))
     {
         return;
     }

     // Reserve a slot. atomic_inc returns the counter value before the increment
     const int slot = atomic_inc(num_detection_windows);
     if(slot >= MAX_NUM_DETECTION_WINDOWS)
     {
         return;
     }

     __global DetectionWindow *window = dst + slot;

     window->x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
     window->y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
     window->width     = DETECTION_WINDOW_WIDTH;
     window->height    = DETECTION_WINDOW_HEIGHT;
     window->idx_class = IDX_CLASS;
     window->score     = score;
 }
 #endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
         * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */

 )"