Snap for 10453563 from 6b9ff13286194c7a2b38d624eeee38dc35987dc8 to mainline-sdkext-release

Change-Id: Ic2b4a6f8ef68589f8fe2d7e37d1695ca92dc5ae9
diff --git a/.appveyor.yml b/.appveyor.yml
deleted file mode 100644
index ea01077..0000000
--- a/.appveyor.yml
+++ /dev/null
@@ -1,54 +0,0 @@
-  - Visual Studio 2017
-shallow_clone: true
-  - Win32
-  - x64
-  - Release
-  matrix:
-   - SETARCH: i686
-   - SETARCH: x86_64
-  exclude:
-    - platform: Win32
-      SETARCH: x86_64
-    - platform: x64
-      SETARCH: i686
-  # Setup environment:
-  - ps: $env:TOP = $env:APPVEYOR_BUILD_FOLDER
-  - ps: $env:TOP
-  - echo %TOP%
-  # Get the OpenCL Headers:
-  - git clone --depth=1 OpenCL-Headers
-  # Get and build the OpenCL ICD Loader:
-  - git clone --depth=1
-  - ps: cd OpenCL-ICD-Loader
-  - ps: mkdir build
-  - ps: cd build
-  - cmake --build . --config %CONFIGURATION%
-  - ps: cd $env:TOP
-  # Get the libclcxx standard library:
-  - git clone --depth=1 libclcxx
-  # Generate the CTS solution file:
-  - cmake -DCL_INCLUDE_DIR=%TOP%/OpenCL-Headers
-          -DCL_LIB_DIR=%TOP%/OpenCL-ICD-Loader/build
-          -DCL_LIBCLCXX_DIR=%TOP%/libclcxx
-          -DOPENCL_LIBRARIES="OpenCL"
-          -H. -Bbuild_win -A%PLATFORM%
-  project: build_win\CLConform.sln
-  parallel: true
-  verbosity: normal
diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 0c1778e..1dfdb96 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -3,37 +3,62 @@
-    name: Build ${{ matrix.os }} ${{ }}
+    name: Build ${{ matrix.os }} ${{ matrix.arch }}${{ matrix.extra }}
     runs-on: ${{ matrix.os }}
       JOB_ARCHITECTURE: ${{ matrix.arch }}
       JOB_ENABLE_GL: ${{ }}
+      JOB_ENABLE_DEBUG: ${{ matrix.debug }}
+      fail-fast: false
         mainmatrix: [true]
-        os: [ubuntu-20.04, macos-11.0]
+        os: [ubuntu-20.04, macos-latest, windows-latest]
           - os: ubuntu-20.04
             mainmatrix: true
             gl: 1
+            extra: " gl"
           - os: ubuntu-20.04
             mainmatrix: false
-            name: Arm
             arch: arm
           - os: ubuntu-20.04
             mainmatrix: false
-            name: AArch64
             arch: aarch64
+            debug: 1
+            extra: " debug"
       - uses: actions/checkout@v2
+      - name: Setup Ninja
+        uses: seanmiddleditch/gha-setup-ninja@master
+      - name: Setup OpenGL build dependencies
+        if: ${{ }}
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install libglu1-mesa-dev freeglut3-dev mesa-common-dev libglew-dev
+      - name: Setup MSVC with Ninja
+        uses: ilammy/msvc-dev-cmd@v1
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          variant: sccache
+          key: ${{ matrix.os }}-${{ matrix.arch }}
+      - name: Fetch OpenCL Headers
+        shell: bash
+        run: |
+          git clone
+          cd OpenCL-Headers
+          ln -s CL OpenCL # For OSX builds
+          cd ..
       - name: Build
+        shell: bash
         run: ./
     name: Check code format
     runs-on: ubuntu-20.04
       - name: Install packages
-        run: sudo apt install -y clang-format
+        run: sudo apt install -y clang-format clang-format-9
       - uses: actions/checkout@v2
           fetch-depth: 0
diff --git a/Android.bp b/Android.bp
index 37913dd..2c9a3b6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,24 +1,7 @@
 package {
     default_applicable_licenses: ["external_OpenCL-CTS_license"],
-// Added automatically by a large-scale-change that took the approach of
-// 'apply every license found to every target'. While this makes sure we respect
-// every license restriction, it may not be entirely correct.
-// e.g. GPL in an MIT project might only apply to the contrib/ directory.
-// Please consider splitting the single license below into multiple licenses,
-// taking care not to lose any license_kind information, and overriding the
-// default license using the 'licenses: [...]' property on targets as needed.
-// For unused files, consider creating a 'fileGroup' with "//visibility:private"
-// to attach the license to, and including a comment whether the files may be
-// used in the current project.
-// See: http://go/android-license-faq
 license {
     name: "external_OpenCL-CTS_license",
     visibility: [":__subpackages__"],
@@ -27,9 +10,6 @@
-        "legacy_by_exception_only", // by exception only
-        "legacy_proprietary", // by exception only
-        "legacy_unencumbered",
     license_text: [
@@ -40,8 +20,8 @@
     name: "ocl-harness-headers",
     export_include_dirs: [
-        "test_common"
-    ]
+        "test_common",
+    ],
 cc_defaults {
@@ -56,54 +36,36 @@
-        "-Wno-absolute-value",
-        "-Wno-asm-operand-widths",
-        "-Wno-dangling-else",
-        "-Wno-ignored-pragmas",
-        "-Wno-logical-op-parentheses",
-        "-Wno-macro-redefined",
-        "-Wno-missing-declarations",
-        "-Wno-parentheses",
-        "-Wno-parentheses-equality",
-        "-Wno-return-stack-address",
-        "-Wno-shift-negative-value",
-        "-Wno-switch",
-        "-Wno-unknown-pragmas",
-        "-Wno-unneeded-internal-declaration",
-        "-Wno-unused-function",
-        "-Wno-unused-label",
-        "-Wno-unused-variable",
-        "-Wno-writable-strings",
     static_libs: [
-        "ocl-stubs"
+        "ocl-stubs",
 cc_library {
     name: "ocl-harness",
-    srcs: [ "test_common/harness/*.cpp", ],
-    defaults: [ "ocl-harness-defaults" ],
+    srcs: ["test_common/harness/*.cpp"],
+    defaults: ["ocl-harness-defaults"],
 cc_defaults {
     name: "ocl-test-defaults",
-    defaults: [ "ocl-harness-defaults" ],
-    static_libs: [ "ocl-harness" ],
+    defaults: ["ocl-harness-defaults"],
+    static_libs: ["ocl-harness"],
     compile_multilib: "64",
     multilib: {
         lib64: {
@@ -114,398 +76,366 @@
 cc_defaults {
     name: "ocl-test-image-defaults",
-    srcs: [ "test_conformance/images/common.cpp" ],
-    export_include_dirs: [ "test_conformance/images" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/images/common.cpp"],
+    export_include_dirs: ["test_conformance/images"],
+    defaults: ["ocl-test-defaults"],
 cc_test {
     name: "ocl-test-allocations",
-    srcs: [ "test_conformance/allocations/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/allocations/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-api",
-    srcs: [ "test_conformance/api/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/api/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-atomics",
-    srcs: [ "test_conformance/atomics/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/atomics/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-basic",
-    srcs: [ "test_conformance/basic/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/basic/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-buffers",
-    srcs: [ "test_conformance/buffers/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/buffers/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-c11-atomics",
-    srcs: [ "test_conformance/c11_atomics/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/c11_atomics/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-commonfns",
-    srcs: [ "test_conformance/commonfns/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/commonfns/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-compiler",
-    srcs: [ "test_conformance/compiler/*.cpp" ],
-    data: [ "test_conformance/compiler/includeTestDirectory/testIncludeFile.h", "test_conformance/compiler/secondIncludeTestDirectory/testIncludeFile.h" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/compiler/*.cpp"],
+    data: [
+        "test_conformance/compiler/includeTestDirectory/testIncludeFile.h",
+        "test_conformance/compiler/secondIncludeTestDirectory/testIncludeFile.h",
+    ],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-computeinfo",
-    srcs: [ "test_conformance/computeinfo/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/computeinfo/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-contractions",
-    srcs: [ "test_conformance/contractions/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/contractions/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-conversions",
-    srcs: [ "test_conformance/conversions/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/conversions/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-device-execution",
-    srcs: [ "test_conformance/device_execution/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/device_execution/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-device-partition",
-    srcs: [ "test_conformance/device_partition/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/device_partition/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-device-timer",
-    srcs: [ "test_conformance/device_timer/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/device_timer/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-events",
-    srcs: [ "test_conformance/events/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/events/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-generic-address-space",
-    srcs: [ "test_conformance/generic_address_space/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/generic_address_space/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-geometrics",
-    srcs: [ "test_conformance/geometrics/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/geometrics/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-half",
-    srcs: [ "test_conformance/half/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/half/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-integer-ops",
-    srcs: [ "test_conformance/integer_ops/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/integer_ops/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-math-brute-force",
-    srcs: [ "test_conformance/math_brute_force/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/math_brute_force/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-mem-host-flags",
-    srcs: [ "test_conformance/mem_host_flags/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/mem_host_flags/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-multiple-device-context",
-    srcs: [ "test_conformance/multiple_device_context/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/multiple_device_context/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-non-uniform-work-group",
-    srcs: [ "test_conformance/non_uniform_work_group/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/non_uniform_work_group/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-pipes",
-    srcs: [ "test_conformance/pipes/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/pipes/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-printf",
-    srcs: [ "test_conformance/printf/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/printf/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-profiling",
-    srcs: [ "test_conformance/profiling/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/profiling/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-relationals",
-    srcs: [ "test_conformance/relationals/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/relationals/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-select",
-    srcs: [ "test_conformance/select/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/select/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-spir",
-    srcs: [ "test_conformance/spir/*.cpp", "test_conformance/math_brute_force/function_list.cpp", "test_common/miniz/miniz.c" ],
-    data: [ "test_conformance/spir/*.zip" ],
-    cflags: [ "-DFUNCTION_LIST_ULPS_ONLY", "-Wno-unused-private-field" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: [
+        "test_conformance/spir/*.cpp",
+        "test_conformance/math_brute_force/function_list.cpp",
+        "test_common/miniz/miniz.c",
+    ],
+    data: ["test_conformance/spir/*.zip"],
+    cflags: [
+        "-Wno-unused-private-field",
+    ],
+    defaults: ["ocl-test-defaults"],
     rtti: true,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-spirv-new",
-    srcs: [ "test_conformance/spirv_new/*.cpp", "test_conformance/math_brute_force/reference_math.cpp", "test_conformance/math_brute_force/utility.cpp" ],
-    data: [ "test_conformance/spirv_new/spirv_asm/*", "test_conformance/spirv_new/spirv_bin/*" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: [
+        "test_conformance/spirv_new/*.cpp",
+        "test_conformance/math_brute_force/reference_math.cpp",
+        "test_conformance/math_brute_force/utility.cpp",
+    ],
+    data: [
+        "test_conformance/spirv_new/spirv_asm/*",
+        "test_conformance/spirv_new/spirv_bin/*",
+    ],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-subgroups",
-    srcs: [ "test_conformance/subgroups/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/subgroups/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-svm",
-    srcs: [ "test_conformance/SVM/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/SVM/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-thread-dimensions",
-    srcs: [ "test_conformance/thread_dimensions/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/thread_dimensions/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-vectors",
-    srcs: [ "test_conformance/vectors/*.cpp" ],
-    defaults: [ "ocl-test-defaults" ],
+    srcs: ["test_conformance/vectors/*.cpp"],
+    defaults: ["ocl-test-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-clcopyimage",
-    srcs: [ "test_conformance/images/clCopyImage/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/clCopyImage/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-clfillimage",
-    srcs: [ "test_conformance/images/clFillImage/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/clFillImage/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-clgetinfo",
-    srcs: [ "test_conformance/images/clGetInfo/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/clGetInfo/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-clreadwriteimage",
-    srcs: [ "test_conformance/images/clReadWriteImage/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/clReadWriteImage/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-kernel-image-methods",
-    srcs: [ "test_conformance/images/kernel_image_methods/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/kernel_image_methods/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-kernel-read-write",
-    srcs: [ "test_conformance/images/kernel_read_write/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/kernel_read_write/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 cc_test {
     name: "ocl-test-image-samplerlessreads",
-    srcs: [ "test_conformance/images/samplerlessReads/*.cpp" ],
-    defaults: [ "ocl-test-image-defaults" ],
+    srcs: ["test_conformance/images/samplerlessReads/*.cpp"],
+    defaults: ["ocl-test-image-defaults"],
     rtti: false,
-    gtest: false
+    gtest: false,
 python_test_host {
     name: "opencl_cts",
     main: "scripts/",
-    srcs: [ "scripts/" ],
-    data: [ "scripts/test_opencl_cts.xml" ],
+    srcs: ["scripts/"],
+    data: ["scripts/test_opencl_cts.xml"],
     test_config: "scripts/test_opencl_cts.xml",
-    version: {
-        py2: {
-            enabled: false,
-        },
-        py3: {
-            enabled: true
-        }
-    },
     test_options: {
         unit_test: false,
@@ -514,15 +444,5 @@
 python_test {
     name: "run_conformance",
     main: "test_conformance/",
-    srcs: [ "test_conformance/" ],
-    version: {
-        py2: {
-            enabled: true,
-            embedded_launcher: true,
-        },
-        py3: {
-            enabled: false,
-       }
-    },
+    srcs: ["test_conformance/"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 083ea96..6a25d5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,12 +10,6 @@
-    set (BUILD_FLAVOR "release")
-    set (BUILD_FLAVOR "debug")
@@ -29,14 +23,6 @@
-# Support both VS2008 and VS2012.
-  set(VS_BUILD_DIR "${BUILD_DIR}/vs2008")
-  set(VS_BUILD_DIR "${BUILD_DIR}/vs2012")
 # Default Configurable Test Set
@@ -102,14 +88,14 @@
+    add_cxx_flag_if_supported(-Wmisleading-indentation)
+    add_cxx_flag_if_supported(-Wunused-variable)
     add_cxx_flag_if_supported(-Wno-error=cpp) # Allow #warning directive
-    add_cxx_flag_if_supported(-Wno-error=absolute-value) # Issue 783
     add_cxx_flag_if_supported(-Wno-error=unknown-pragmas) # Issue #785
     add_cxx_flag_if_supported(-Wno-error=asm-operand-widths) # Issue #784
-    add_cxx_flag_if_supported(-Wno-error=overflow) # Fixed by #699
     # -msse -mfpmath=sse to force gcc to use sse for float math,
     # avoiding excess precision problems that cause tests like int2float
@@ -127,9 +113,24 @@
+# Set a module's COMPILE_FLAGS if using gcc or clang.
+macro(set_gnulike_module_compile_flags flags)
+            ${${MODULE_NAME}_SOURCES}
+            PROPERTIES
+            COMPILE_FLAGS ${flags}
+        )
+    endif()
     # Don't warn when using standard non-secure functions.
+    # Don't warn about using the portable "strdup" function.
+    add_compile_definitions(_CRT_NONSTDC_NO_DEPRECATE)
+    # Fix std::min and std::max handling with windows.h.
+    add_compile_definitions(NOMINMAX)
@@ -152,10 +153,6 @@
     list(APPEND CLConform_LIBRARIES pthread)
     find_library(corefoundation CoreFoundation)
     find_library(iokit IOKit)
@@ -169,38 +166,5 @@
-    set (BUILD_FLAVOR "release")
-    set (BUILD_FLAVOR "debug")
-# Support both VS2008 and VS2012.
-set (DLL_FILES "${VS_BUILD_DIR}/Debug/*.dll")
-if (WIN32)
-    set (COPY "echo")
-    add_custom_target(COPY_DLL${CONFORMANCE_SUFFIX} ALL
-                      COMMAND ${COPY} "${DLL_FILES}" "${DST_DIR}"
-                      COMMENT "Copying dll files.. ")
-else (WIN32)
-    set (COPY cp)
-    add_custom_target(COPY_DLL${CONFORMANCE_SUFFIX})
-                     COMMAND ${COPY} ${DLL_FILES} ${DST_DIR}
-                     COMMENT "Copying other files to output folder..." )
-  add_custom_target( COPY_FILES${CONFORMANCE_SUFFIX} )
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..85de3d4
--- /dev/null
@@ -0,0 +1 @@
\ No newline at end of file
diff --git a/METADATA b/METADATA
index 1eaf99d..235bc5a 100644
@@ -1,7 +1,19 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/ update OpenCL-CTS
+# For more info, check
+name: "OpenCL-CTS"
+description: "OpenCL Conformance Tests"
 third_party {
-  license_note: "Khronos proprietary"
-  license_type: BY_EXCEPTION_ONLY
+  url {
+    type: GIT
+    value: ""
+  }
+  version: "90a5183ec499d5b4701f58f6134dd424d82c4dca"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2022
+    month: 10
+    day: 26
+  }
new file mode 100644
index 0000000..e69de29
--- /dev/null
diff --git a/ b/
index b2d825f..3d41064 100644
--- a/
+++ b/
@@ -1,2 +1,115 @@
-# OpenCL-CTS [![Build Status](](
-The OpenCL Conformance Tests
+# OpenCL Conformance Test Suite (CTS)
+This is the OpenCL CTS for all versions of the Khronos
+[OpenCL]( standard.
+## Building the CTS
+The CTS supports Linux, Windows, macOS, and Android platforms. In particular,
+GitHub Actions CI builds against Ubuntu 20.04, Windows-latest, and
+Compiling the CTS requires the following CMake configuration options to be set:
+* `CL_INCLUDE_DIR` Points to the unified
+  [OpenCL-Headers](
+* `CL_LIB_DIR` Directory containing the OpenCL library to build against.
+* `OPENCL_LIBRARIES` Name of the OpenCL library to link.
+It is advised that the [OpenCL ICD-Loader](
+is used as the OpenCL library to build against. Where `CL_LIB_DIR` points to a
+build of the ICD loader and `OPENCL_LIBRARIES` is "OpenCL".
+### Example Build
+Steps on a Linux platform to clone dependencies from GitHub sources, configure
+a build, and compile.
+git clone
+git clone
+git clone
+mkdir OpenCL-ICD-Loader/build
+cmake -S OpenCL-ICD-Loader -B OpenCL-ICD-Loader/build \
+cmake --build ./OpenCL-ICD-Loader/build --config Release
+mkdir OpenCL-CTS/build
+cmake -S OpenCL-CTS -B OpenCL-CTS/build \
+      -DCL_INCLUDE_DIR=$PWD/OpenCL-Headers \
+      -DCL_LIB_DIR=$PWD/OpenCL-ICD-Loader/build \
+cmake --build OpenCL-CTS/build --config Release
+## Running the CTS
+A build of the CTS contains multiple executables representing the directories in
+the `test_conformance` folder. Each of these executables contains sub-tests, and
+possibly smaller granularities of testing within the sub-tests.
+See the `--help` output on each executable for the list of sub-tests available,
+as well as other options for configuring execution.
+If the OpenCL library built against is the ICD Loader, and the vendor library to
+be tested is not registered in the
+[default ICD Loader location](
+then the [OCL_ICD_FILENAMES](
+environment variable will need to be set for the ICD Loader to detect the OpenCL
+library to use at runtime. For example, to run the basic tests on a Linux
+OCL_ICD_FILENAMES=/path/to/ ./test_basic
+### Offline Compilation
+Testing OpenCL drivers which do not have a runtime compiler can be done by using
+additional command line arguments provided by the test harness for tests which
+require compilation, these are:
+* `--compilation-mode` Selects if OpenCL-C source code should be compiled using
+  an external tool before being passed on to the OpenCL driver in that form for
+  testing. Online is the default mode, but the values `spir-v` and
+  `binary` are also accepted.
+* `--compilation-cache-mode` Controls how the compiled OpenCL-C source code
+  should be cached on disk.
+* `--compilation-cache-path` Accepts a path to a directory where the compiled
+  binary cache should be stored on disk.
+* `--compilation-program` Accepts a path to an executable (default:
+   cl_offline_compiler) invoked by the test harness to perform offline
+   compilation of OpenCL-C source code.  This executable must match the
+   [interface description](test_common/harness/cl_offline_compiler-interface.txt).
+## Generating a Conformance Report
+The Khronos [Conformance Process Document](
+details the steps required for a conformance submission.
+In this repository [opencl_conformance_tests_full.csv](test_conformance/opencl_conformance_tests_full.csv)
+defines the full list of tests which must be run for conformance. The output log
+of which must be included alongside a filled in
+[submission details template](test_conformance/submission_details_template.txt).
+Utility script [](test_conformance/ can be
+used to help generate the submission log, although it is not required.
+Git [tags]( are used to define
+the version of the repository conformance submissions are made against.
+## Contributing
+Contributions are welcome to the project from Khronos members and non-members
+alike via GitHub Pull Requests (PR). Alternatively, if you've found a bug or have
+a question, please file an issue in the GitHub project. First-time contributors
+will be required to sign the Khronos Contributor License Agreement (CLA) before
+their PR can be merged.
+PRs to the repository are required to be `clang-format` clean to pass CI.
+Developers can either use the `git-clang-format` tool locally to verify this
+before contributing, or update their PR based on the diff provided by a failing
+CI job.
diff --git a/ b/
index 7de2bd2..be8f9d7 100755
--- a/
+++ b/
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
-# Arg used to specify non-'origin/master' comparison branch
+# Arg used to specify non-'origin/main' comparison branch
 CLANG_BINARY=${2:-"`which clang-format-9`"}
 # Run git-clang-format to check for violations
diff --git a/dependencies/Android.bp b/dependencies/Android.bp
index a8dbeee..e521ca8 100644
--- a/dependencies/Android.bp
+++ b/dependencies/Android.bp
@@ -33,7 +33,6 @@
-        "ocl-headers/CL/cl_gl_ext.h",
     cmd: "python3 $(location) $(in) > $(out)"
diff --git a/dependencies/ocl-headers/CL/cl.h b/dependencies/ocl-headers/CL/cl.h
index 0018a0f..6c700ab 100644
--- a/dependencies/ocl-headers/CL/cl.h
+++ b/dependencies/ocl-headers/CL/cl.h
@@ -141,6 +141,10 @@
 #pragma warning( push )
 #pragma warning( disable : 4201 )   /* Prevents warning about nameless struct/union in /W4 builds */
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc11-extensions" /* Prevents warning about nameless union being C11 extension*/
 #if defined(_MSC_VER) && defined(__STDC__)
     /* Anonymous unions are not supported in /Za builds */
@@ -158,6 +162,9 @@
 #if defined(_MSC_VER) && !defined(__STDC__)
 #pragma warning( pop )
+#ifdef __clang__
+#pragma clang diagnostic pop
 } cl_image_desc;
diff --git a/dependencies/ocl-headers/CL/cl_ext.h b/dependencies/ocl-headers/CL/cl_ext.h
index 80a81de..3eba7ed 100644
--- a/dependencies/ocl-headers/CL/cl_ext.h
+++ b/dependencies/ocl-headers/CL/cl_ext.h
@@ -26,6 +26,494 @@
 #include <CL/cl.h>
+* cl_khr_command_buffer
+#define cl_khr_command_buffer 1
+    "cl_khr_command_buffer"
+typedef cl_bitfield         cl_device_command_buffer_capabilities_khr;
+typedef struct _cl_command_buffer_khr* cl_command_buffer_khr;
+typedef cl_uint             cl_sync_point_khr;
+typedef cl_uint             cl_command_buffer_info_khr;
+typedef cl_uint             cl_command_buffer_state_khr;
+typedef cl_properties       cl_command_buffer_properties_khr;
+typedef cl_bitfield         cl_command_buffer_flags_khr;
+typedef cl_properties       cl_ndrange_kernel_command_properties_khr;
+typedef struct _cl_mutable_command_khr* cl_mutable_command_khr;
+/* cl_device_info */
+/* cl_device_command_buffer_capabilities_khr - bitfield */
+/* cl_command_buffer_properties_khr */
+#define CL_COMMAND_BUFFER_FLAGS_KHR                         0x1293
+/* cl_command_buffer_flags_khr */
+#define CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR              (1 << 0)
+/* Error codes */
+#define CL_INVALID_COMMAND_BUFFER_KHR                       -1138
+#define CL_INVALID_SYNC_POINT_WAIT_LIST_KHR                 -1139
+#define CL_INCOMPATIBLE_COMMAND_QUEUE_KHR                   -1140
+/* cl_command_buffer_info_khr */
+#define CL_COMMAND_BUFFER_QUEUES_KHR                        0x1294
+#define CL_COMMAND_BUFFER_NUM_QUEUES_KHR                    0x1295
+#define CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR               0x1296
+#define CL_COMMAND_BUFFER_STATE_KHR                         0x1297
+#define CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR              0x1298
+/* cl_command_buffer_state_khr */
+#define CL_COMMAND_BUFFER_STATE_PENDING_KHR                 2
+#define CL_COMMAND_BUFFER_STATE_INVALID_KHR                 3
+/* cl_command_type */
+#define CL_COMMAND_COMMAND_BUFFER_KHR                       0x12A8
+typedef cl_command_buffer_khr (CL_API_CALL *
+    cl_uint num_queues,
+    const cl_command_queue* queues,
+    const cl_command_buffer_properties_khr* properties,
+    cl_int* errcode_ret) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer) ;
+typedef cl_int (CL_API_CALL *
+    cl_uint num_queues,
+    cl_command_queue* queues,
+    cl_command_buffer_khr command_buffer,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_buffer,
+    cl_mem dst_buffer,
+    size_t src_offset,
+    size_t dst_offset,
+    size_t size,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_buffer,
+    cl_mem dst_buffer,
+    const size_t* src_origin,
+    const size_t* dst_origin,
+    const size_t* region,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_buffer,
+    cl_mem dst_image,
+    size_t src_offset,
+    const size_t* dst_origin,
+    const size_t* region,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_image,
+    cl_mem dst_image,
+    const size_t* src_origin,
+    const size_t* dst_origin,
+    const size_t* region,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_image,
+    cl_mem dst_buffer,
+    const size_t* src_origin,
+    const size_t* region,
+    size_t dst_offset,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem buffer,
+    const void* pattern,
+    size_t pattern_size,
+    size_t offset,
+    size_t size,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem image,
+    const void* fill_color,
+    const size_t* origin,
+    const size_t* region,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    const cl_ndrange_kernel_command_properties_khr* properties,
+    cl_kernel kernel,
+    cl_uint work_dim,
+    const size_t* global_work_offset,
+    const size_t* global_work_size,
+    const size_t* local_work_size,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    cl_command_buffer_info_khr param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) ;
+extern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL
+    cl_uint num_queues,
+    const cl_command_queue* queues,
+    const cl_command_buffer_properties_khr* properties,
+    cl_int* errcode_ret) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_uint num_queues,
+    cl_command_queue* queues,
+    cl_command_buffer_khr command_buffer,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_buffer,
+    cl_mem dst_buffer,
+    size_t src_offset,
+    size_t dst_offset,
+    size_t size,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_buffer,
+    cl_mem dst_buffer,
+    const size_t* src_origin,
+    const size_t* dst_origin,
+    const size_t* region,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_buffer,
+    cl_mem dst_image,
+    size_t src_offset,
+    const size_t* dst_origin,
+    const size_t* region,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_image,
+    cl_mem dst_image,
+    const size_t* src_origin,
+    const size_t* dst_origin,
+    const size_t* region,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem src_image,
+    cl_mem dst_buffer,
+    const size_t* src_origin,
+    const size_t* region,
+    size_t dst_offset,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem buffer,
+    const void* pattern,
+    size_t pattern_size,
+    size_t offset,
+    size_t size,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    cl_mem image,
+    const void* fill_color,
+    const size_t* origin,
+    const size_t* region,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_queue command_queue,
+    const cl_ndrange_kernel_command_properties_khr* properties,
+    cl_kernel kernel,
+    cl_uint work_dim,
+    const size_t* global_work_offset,
+    const size_t* global_work_size,
+    const size_t* local_work_size,
+    cl_uint num_sync_points_in_wait_list,
+    const cl_sync_point_khr* sync_point_wait_list,
+    cl_sync_point_khr* sync_point,
+    cl_mutable_command_khr* mutable_handle) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    cl_command_buffer_info_khr param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) ;
+#endif /* CL_NO_PROTOTYPES */
+* cl_khr_command_buffer_mutable_dispatch
+#define cl_khr_command_buffer_mutable_dispatch 1
+    "cl_khr_command_buffer_mutable_dispatch"
+typedef cl_uint             cl_command_buffer_structure_type_khr;
+typedef cl_bitfield         cl_mutable_dispatch_fields_khr;
+typedef cl_uint             cl_mutable_command_info_khr;
+typedef struct _cl_mutable_dispatch_arg_khr {
+    cl_uint arg_index;
+    size_t arg_size;
+    const void* arg_value;
+} cl_mutable_dispatch_arg_khr;
+typedef struct _cl_mutable_dispatch_exec_info_khr {
+    cl_uint param_name;
+    size_t param_value_size;
+    const void* param_value;
+} cl_mutable_dispatch_exec_info_khr;
+typedef struct _cl_mutable_dispatch_config_khr {
+    cl_command_buffer_structure_type_khr type;
+    const void* next;
+    cl_mutable_command_khr command;
+    cl_uint num_args;
+    cl_uint num_svm_args;
+    cl_uint num_exec_infos;
+    cl_uint work_dim;
+    const cl_mutable_dispatch_arg_khr* arg_list;
+    const cl_mutable_dispatch_arg_khr* arg_svm_list;
+    const cl_mutable_dispatch_exec_info_khr* exec_info_list;
+    const size_t* global_work_offset;
+    const size_t* global_work_size;
+    const size_t* local_work_size;
+} cl_mutable_dispatch_config_khr;
+typedef struct _cl_mutable_base_config_khr {
+    cl_command_buffer_structure_type_khr type;
+    const void* next;
+    cl_uint num_mutable_dispatch;
+    const cl_mutable_dispatch_config_khr* mutable_dispatch_list;
+} cl_mutable_base_config_khr;
+/* cl_command_buffer_flags_khr - bitfield */
+#define CL_COMMAND_BUFFER_MUTABLE_KHR                       (1 << 1)
+/* Error codes */
+#define CL_INVALID_MUTABLE_COMMAND_KHR                      -1141
+/* cl_device_info */
+/* cl_ndrange_kernel_command_properties_khr */
+/* cl_mutable_dispatch_fields_khr - bitfield */
+#define CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR               (1 << 0)
+#define CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR                 (1 << 1)
+#define CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR                  (1 << 2)
+#define CL_MUTABLE_DISPATCH_ARGUMENTS_KHR                   (1 << 3)
+#define CL_MUTABLE_DISPATCH_EXEC_INFO_KHR                   (1 << 4)
+/* cl_mutable_command_info_khr */
+#define CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR                0x12A0
+#define CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR               0x12A1
+#define CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR                 0x12AD
+#define CL_MUTABLE_DISPATCH_KERNEL_KHR                      0x12A3
+#define CL_MUTABLE_DISPATCH_DIMENSIONS_KHR                  0x12A4
+/* cl_command_buffer_structure_type_khr */
+typedef cl_int (CL_API_CALL *
+    cl_command_buffer_khr command_buffer,
+    const cl_mutable_base_config_khr* mutable_config) ;
+typedef cl_int (CL_API_CALL *
+    cl_mutable_command_khr command,
+    cl_mutable_command_info_khr param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_buffer_khr command_buffer,
+    const cl_mutable_base_config_khr* mutable_config) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_mutable_command_khr command,
+    cl_mutable_command_info_khr param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) ;
+#endif /* CL_NO_PROTOTYPES */
 /* cl_khr_fp64 extension - no extension #define since it has no functions  */
 /* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */
@@ -734,6 +1222,321 @@
     size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0;
+* cl_khr_integer_dot_product
+#define cl_khr_integer_dot_product 1
+typedef cl_bitfield         cl_device_integer_dot_product_capabilities_khr;
+/* cl_device_integer_dot_product_capabilities_khr */
+typedef struct _cl_device_integer_dot_product_acceleration_properties_khr {
+    cl_bool signed_accelerated;
+    cl_bool unsigned_accelerated;
+    cl_bool mixed_signedness_accelerated;
+    cl_bool accumulating_saturating_signed_accelerated;
+    cl_bool accumulating_saturating_unsigned_accelerated;
+    cl_bool accumulating_saturating_mixed_signedness_accelerated;
+} cl_device_integer_dot_product_acceleration_properties_khr;
+/* cl_device_info */
+#define CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR                          0x1073
+* cl_khr_external_memory
+#define cl_khr_external_memory 1
+typedef cl_uint             cl_external_memory_handle_type_khr;
+/* cl_platform_info */
+/* cl_device_info */
+/* cl_mem_properties */
+#define CL_DEVICE_HANDLE_LIST_KHR                           0x2051
+#define CL_DEVICE_HANDLE_LIST_END_KHR                       0
+/* cl_command_type */
+typedef cl_int (CL_API_CALL *
+    cl_command_queue command_queue,
+    cl_uint num_mem_objects,
+    const cl_mem* mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_3_0;
+typedef cl_int (CL_API_CALL *
+    cl_command_queue command_queue,
+    cl_uint num_mem_objects,
+    const cl_mem* mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_3_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    cl_uint num_mem_objects,
+    const cl_mem* mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_3_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    cl_uint num_mem_objects,
+    const cl_mem* mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_3_0;
+* cl_khr_external_memory_dma_buf
+#define cl_khr_external_memory_dma_buf 1
+/* cl_external_memory_handle_type_khr */
+#define CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR               0x2067
+* cl_khr_external_memory_dx
+#define cl_khr_external_memory_dx 1
+/* cl_external_memory_handle_type_khr */
+#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_HEAP_KHR            0x2065
+* cl_khr_external_memory_opaque_fd
+#define cl_khr_external_memory_opaque_fd 1
+/* cl_external_memory_handle_type_khr */
+#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR             0x2060
+* cl_khr_external_memory_win32
+#define cl_khr_external_memory_win32 1
+/* cl_external_memory_handle_type_khr */
+* cl_khr_external_semaphore
+#define cl_khr_external_semaphore 1
+typedef struct _cl_semaphore_khr * cl_semaphore_khr;
+typedef cl_uint             cl_external_semaphore_handle_type_khr;
+/* cl_platform_info */
+/* cl_device_info */
+/* cl_semaphore_properties_khr */
+#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR                0x203F
+typedef cl_int (CL_API_CALL *
+    cl_semaphore_khr sema_object,
+    cl_device_id device,
+    cl_external_semaphore_handle_type_khr handle_type,
+    size_t handle_size,
+    void* handle_ptr,
+    size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_semaphore_khr sema_object,
+    cl_device_id device,
+    cl_external_semaphore_handle_type_khr handle_type,
+    size_t handle_size,
+    void* handle_ptr,
+    size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2;
+* cl_khr_external_semaphore_dx_fence
+#define cl_khr_external_semaphore_dx_fence 1
+/* cl_external_semaphore_handle_type_khr */
+#define CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR                 0x2059
+* cl_khr_external_semaphore_opaque_fd
+#define cl_khr_external_semaphore_opaque_fd 1
+/* cl_external_semaphore_handle_type_khr */
+#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR                   0x2055
+* cl_khr_external_semaphore_sync_fd
+#define cl_khr_external_semaphore_sync_fd 1
+/* cl_external_semaphore_handle_type_khr */
+#define CL_SEMAPHORE_HANDLE_SYNC_FD_KHR                     0x2058
+* cl_khr_external_semaphore_win32
+#define cl_khr_external_semaphore_win32 1
+/* cl_external_semaphore_handle_type_khr */
+#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR                0x2056
+#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR            0x2057
+* cl_khr_semaphore
+#define cl_khr_semaphore 1
+/* type cl_semaphore_khr */
+typedef cl_properties       cl_semaphore_properties_khr;
+typedef cl_uint             cl_semaphore_info_khr;
+typedef cl_uint             cl_semaphore_type_khr;
+typedef cl_ulong            cl_semaphore_payload_khr;
+/* cl_semaphore_type */
+#define CL_SEMAPHORE_TYPE_BINARY_KHR                        1
+/* cl_platform_info */
+#define CL_PLATFORM_SEMAPHORE_TYPES_KHR                     0x2036
+/* cl_device_info */
+#define CL_DEVICE_SEMAPHORE_TYPES_KHR                       0x204C
+/* cl_semaphore_info_khr */
+#define CL_SEMAPHORE_CONTEXT_KHR                            0x2039
+#define CL_SEMAPHORE_REFERENCE_COUNT_KHR                    0x203A
+#define CL_SEMAPHORE_PROPERTIES_KHR                         0x203B
+#define CL_SEMAPHORE_PAYLOAD_KHR                            0x203C
+/* cl_semaphore_info_khr or cl_semaphore_properties_khr */
+#define CL_SEMAPHORE_TYPE_KHR                               0x203D
+/* cl_command_type */
+#define CL_COMMAND_SEMAPHORE_WAIT_KHR                       0x2042
+#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR                     0x2043
+/* Error codes */
+#define CL_INVALID_SEMAPHORE_KHR                            -1142
+typedef cl_semaphore_khr (CL_API_CALL *
+    cl_context context,
+    const cl_semaphore_properties_khr* sema_props,
+    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int (CL_API_CALL *
+    cl_command_queue command_queue,
+    cl_uint num_sema_objects,
+    const cl_semaphore_khr* sema_objects,
+    const cl_semaphore_payload_khr* sema_payload_list,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int (CL_API_CALL *
+    cl_command_queue command_queue,
+    cl_uint num_sema_objects,
+    const cl_semaphore_khr* sema_objects,
+    const cl_semaphore_payload_khr* sema_payload_list,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int (CL_API_CALL *
+    cl_semaphore_khr sema_object,
+    cl_semaphore_info_khr param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int (CL_API_CALL *
+    cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int (CL_API_CALL *
+    cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL
+    cl_context context,
+    const cl_semaphore_properties_khr* sema_props,
+    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    cl_uint num_sema_objects,
+    const cl_semaphore_khr* sema_objects,
+    const cl_semaphore_payload_khr* sema_payload_list,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    cl_uint num_sema_objects,
+    const cl_semaphore_khr* sema_objects,
+    const cl_semaphore_payload_khr* sema_payload_list,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_semaphore_khr sema_object,
+    cl_semaphore_info_khr param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;
  * cl_arm_import_memory extension *
@@ -941,12 +1744,20 @@
 #define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM                (1 << 3)
+#define CL_DEVICE_SCHEDULING_WARP_THROTTLING_ARM               (1 << 5)
+#define CL_DEVICE_MAX_WARP_COUNT_ARM                            0x41EA
 /* cl_kernel_info */
+#define CL_KERNEL_MAX_WARP_COUNT_ARM                            0x41E9
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM                0x41E8
 /* cl_queue_properties */
 #define CL_QUEUE_KERNEL_BATCHING_ARM                            0x41E7
@@ -982,14 +1793,43 @@
-* cl_intel_thread_local_exec extension *
+* cl_arm_protected_memory_allocation *
-#define cl_intel_thread_local_exec 1
+#define cl_arm_protected_memory_allocation 1
+* cl_intel_exec_by_local_thread extension *
+#define cl_intel_exec_by_local_thread 1
 #define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31)
+* cl_intel_device_attribute_query
+#define cl_intel_device_attribute_query 1
+typedef cl_bitfield         cl_device_feature_capabilities_intel;
+/* cl_device_feature_capabilities_intel */
+#define CL_DEVICE_FEATURE_FLAG_DP4A_INTEL                   (1 << 0)
+#define CL_DEVICE_FEATURE_FLAG_DPAS_INTEL                   (1 << 1)
+/* cl_device_info */
+#define CL_DEVICE_IP_VERSION_INTEL                          0x4250
+#define CL_DEVICE_ID_INTEL                                  0x4251
+#define CL_DEVICE_NUM_SLICES_INTEL                          0x4252
+#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL            0x4253
+#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL               0x4254
+#define CL_DEVICE_NUM_THREADS_PER_EU_INTEL                  0x4255
+#define CL_DEVICE_FEATURE_CAPABILITIES_INTEL                0x4256
 * cl_intel_device_partition_by_names extension *
@@ -1342,57 +2182,47 @@
 * cl_intel_unified_shared_memory extension *
-/* These APIs are in sync with Revision Q of the cl_intel_unified_shared_memory spec! */
 #define cl_intel_unified_shared_memory 1
-/* cl_device_info */
-#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL                   0x4190
-#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL                 0x4191
+typedef cl_bitfield         cl_device_unified_shared_memory_capabilities_intel;
+typedef cl_properties 		cl_mem_properties_intel;
+typedef cl_bitfield         cl_mem_alloc_flags_intel;
+typedef cl_uint             cl_mem_info_intel;
+typedef cl_uint             cl_unified_shared_memory_type_intel;
+typedef cl_uint             cl_mem_advice_intel;
-typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
+/* cl_device_info */
+#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL               0x4190
 /* cl_device_unified_shared_memory_capabilities_intel - bitfield */
-#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL                   (1 << 0)
+#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL               (1 << 0)
-typedef cl_properties cl_mem_properties_intel;
 /* cl_mem_properties_intel */
-#define CL_MEM_ALLOC_FLAGS_INTEL        0x4195
-typedef cl_bitfield cl_mem_alloc_flags_intel;
+#define CL_MEM_ALLOC_FLAGS_INTEL                            0x4195
 /* cl_mem_alloc_flags_intel - bitfield */
-#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL               (1 << 0)
-typedef cl_uint cl_mem_info_intel;
+#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL                   (1 << 0)
 /* cl_mem_alloc_info_intel */
-#define CL_MEM_ALLOC_TYPE_INTEL         0x419A
-#define CL_MEM_ALLOC_BASE_PTR_INTEL     0x419B
-#define CL_MEM_ALLOC_SIZE_INTEL         0x419C
-#define CL_MEM_ALLOC_DEVICE_INTEL       0x419D
-/* Enum values 0x419E-0x419F are reserved for future queries. */
-typedef cl_uint cl_unified_shared_memory_type_intel;
+#define CL_MEM_ALLOC_TYPE_INTEL                             0x419A
+#define CL_MEM_ALLOC_BASE_PTR_INTEL                         0x419B
+#define CL_MEM_ALLOC_SIZE_INTEL                             0x419C
+#define CL_MEM_ALLOC_DEVICE_INTEL                           0x419D
 /* cl_unified_shared_memory_type_intel */
-#define CL_MEM_TYPE_UNKNOWN_INTEL       0x4196
-#define CL_MEM_TYPE_HOST_INTEL          0x4197
-#define CL_MEM_TYPE_DEVICE_INTEL        0x4198
-#define CL_MEM_TYPE_SHARED_INTEL        0x4199
-typedef cl_uint cl_mem_advice_intel;
-/* cl_mem_advice_intel */
-/* Enum values 0x4208-0x420F are reserved for future memory advices. */
+#define CL_MEM_TYPE_UNKNOWN_INTEL                           0x4196
+#define CL_MEM_TYPE_HOST_INTEL                              0x4197
+#define CL_MEM_TYPE_DEVICE_INTEL                            0x4198
+#define CL_MEM_TYPE_SHARED_INTEL                            0x4199
 /* cl_kernel_exec_info */
@@ -1401,223 +2231,249 @@
 #define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL                  0x4203
 /* cl_command_type */
-#define CL_COMMAND_MEMFILL_INTEL        0x4204
-#define CL_COMMAND_MEMCPY_INTEL         0x4205
-#define CL_COMMAND_MEMADVISE_INTEL      0x4207
+#define CL_COMMAND_MEMFILL_INTEL                            0x4204
+#define CL_COMMAND_MEMCPY_INTEL                             0x4205
+#define CL_COMMAND_MIGRATEMEM_INTEL                         0x4206
+#define CL_COMMAND_MEMADVISE_INTEL                          0x4207
-extern CL_API_ENTRY void* CL_API_CALL
-            cl_context context,
-            const cl_mem_properties_intel* properties,
-            size_t size,
-            cl_uint alignment,
-            cl_int* errcode_ret);
 typedef void* (CL_API_CALL *
-            cl_context context,
-            const cl_mem_properties_intel* properties,
-            size_t size,
-            cl_uint alignment,
-            cl_int* errcode_ret);
-extern CL_API_ENTRY void* CL_API_CALL
-            cl_context context,
-            cl_device_id device,
-            const cl_mem_properties_intel* properties,
-            size_t size,
-            cl_uint alignment,
-            cl_int* errcode_ret);
+    cl_context context,
+    const cl_mem_properties_intel* properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int* errcode_ret) ;
 typedef void* (CL_API_CALL *
-            cl_context context,
-            cl_device_id device,
-            const cl_mem_properties_intel* properties,
-            size_t size,
-            cl_uint alignment,
-            cl_int* errcode_ret);
-extern CL_API_ENTRY void* CL_API_CALL
-            cl_context context,
-            cl_device_id device,
-            const cl_mem_properties_intel* properties,
-            size_t size,
-            cl_uint alignment,
-            cl_int* errcode_ret);
+    cl_context context,
+    cl_device_id device,
+    const cl_mem_properties_intel* properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int* errcode_ret) ;
 typedef void* (CL_API_CALL *
-            cl_context context,
-            cl_device_id device,
-            const cl_mem_properties_intel* properties,
-            size_t size,
-            cl_uint alignment,
-            cl_int* errcode_ret);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_context context,
-            void* ptr);
+    cl_context context,
+    cl_device_id device,
+    const cl_mem_properties_intel* properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int* errcode_ret) ;
 typedef cl_int (CL_API_CALL *
-            cl_context context,
-            void* ptr);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_context context,
-            void* ptr);
+    cl_context context,
+    void* ptr) ;
 typedef cl_int (CL_API_CALL *
-            cl_context context,
-            void* ptr);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_context context,
-            const void* ptr,
-            cl_mem_info_intel param_name,
-            size_t param_value_size,
-            void* param_value,
-            size_t* param_value_size_ret);
+    cl_context context,
+    void* ptr) ;
 typedef cl_int (CL_API_CALL *
-            cl_context context,
-            const void* ptr,
-            cl_mem_info_intel param_name,
-            size_t param_value_size,
-            void* param_value,
-            size_t* param_value_size_ret);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_kernel kernel,
-            cl_uint arg_index,
-            const void* arg_value);
+    cl_context context,
+    const void* ptr,
+    cl_mem_info_intel param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) ;
 typedef cl_int (CL_API_CALL *
-            cl_kernel kernel,
-            cl_uint arg_index,
-            const void* arg_value);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_command_queue command_queue,
-            void* dst_ptr,
-            cl_int value,
-            size_t size,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
-typedef cl_int (CL_API_CALL *
-            cl_command_queue command_queue,
-            void* dst_ptr,
-            cl_int value,
-            size_t size,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_command_queue command_queue,
-            void* dst_ptr,
-            const void* pattern,
-            size_t pattern_size,
-            size_t size,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
+    cl_kernel kernel,
+    cl_uint arg_index,
+    const void* arg_value) ;
 typedef cl_int (CL_API_CALL *
-            cl_command_queue command_queue,
-            void* dst_ptr,
-            const void* pattern,
-            size_t pattern_size,
-            size_t size,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_command_queue command_queue,
-            cl_bool blocking,
-            void* dst_ptr,
-            const void* src_ptr,
-            size_t size,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
+    cl_command_queue command_queue,
+    void* dst_ptr,
+    const void* pattern,
+    size_t pattern_size,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
 typedef cl_int (CL_API_CALL *
-            cl_command_queue command_queue,
-            cl_bool blocking,
-            void* dst_ptr,
-            const void* src_ptr,
-            size_t size,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
-#ifdef CL_VERSION_1_2
-/* Because these APIs use cl_mem_migration_flags, they require
-   OpenCL 1.2: */
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_command_queue command_queue,
-            const void* ptr,
-            size_t size,
-            cl_mem_migration_flags flags,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
-typedef cl_int (CL_API_CALL *
-            cl_command_queue command_queue,
-            const void* ptr,
-            size_t size,
-            cl_mem_migration_flags flags,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
-extern CL_API_ENTRY cl_int CL_API_CALL
-            cl_command_queue command_queue,
-            const void* ptr,
-            size_t size,
-            cl_mem_advice_intel advice,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
+    cl_command_queue command_queue,
+    cl_bool blocking,
+    void* dst_ptr,
+    const void* src_ptr,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
 typedef cl_int (CL_API_CALL *
-            cl_command_queue command_queue,
-            const void* ptr,
-            size_t size,
-            cl_mem_advice_intel advice,
-            cl_uint num_events_in_wait_list,
-            const cl_event* event_wait_list,
-            cl_event* event);
+    cl_command_queue command_queue,
+    const void* ptr,
+    size_t size,
+    cl_mem_advice_intel advice,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+extern CL_API_ENTRY void* CL_API_CALL
+    cl_context context,
+    const cl_mem_properties_intel* properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int* errcode_ret) ;
+extern CL_API_ENTRY void* CL_API_CALL
+    cl_context context,
+    cl_device_id device,
+    const cl_mem_properties_intel* properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int* errcode_ret) ;
+extern CL_API_ENTRY void* CL_API_CALL
+    cl_context context,
+    cl_device_id device,
+    const cl_mem_properties_intel* properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int* errcode_ret) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_context context,
+    void* ptr) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_context context,
+    void* ptr) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_context context,
+    const void* ptr,
+    cl_mem_info_intel param_name,
+    size_t param_value_size,
+    void* param_value,
+    size_t* param_value_size_ret) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_kernel kernel,
+    cl_uint arg_index,
+    const void* arg_value) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    void* dst_ptr,
+    const void* pattern,
+    size_t pattern_size,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    cl_bool blocking,
+    void* dst_ptr,
+    const void* src_ptr,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    const void* ptr,
+    size_t size,
+    cl_mem_advice_intel advice,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+#endif /* CL_NO_PROTOTYPES */
+#if defined(CL_VERSION_1_2)
+/* Requires OpenCL 1.2 for cl_mem_migration_flags: */
+typedef cl_int (CL_API_CALL *
+    cl_command_queue command_queue,
+    const void* ptr,
+    size_t size,
+    cl_mem_migration_flags flags,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    const void* ptr,
+    size_t size,
+    cl_mem_migration_flags flags,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+#endif /* CL_NO_PROTOTYPES */
+#endif /* defined(CL_VERSION_1_2) */
+/* deprecated, use clEnqueueMemFillINTEL instead */
+typedef cl_int (CL_API_CALL *
+    cl_command_queue command_queue,
+    void* dst_ptr,
+    cl_int value,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_command_queue command_queue,
+    void* dst_ptr,
+    cl_int value,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) ;
+#endif /* CL_NO_PROTOTYPES */
+* cl_intel_mem_alloc_buffer_location
+#define cl_intel_mem_alloc_buffer_location 1
+    "cl_intel_mem_alloc_buffer_location"
+/* cl_mem_properties_intel */
+#define CL_MEM_ALLOC_BUFFER_LOCATION_INTEL                  0x419E
+/* cl_mem_alloc_info_intel */
 * cl_intel_create_buffer_with_properties extension *
@@ -1700,6 +2556,76 @@
 #define CL_QUEUE_CAPABILITY_BARRIER_INTEL                   (1 << 25)
 #define CL_QUEUE_CAPABILITY_KERNEL_INTEL                    (1 << 26)
+* cl_intel_queue_no_sync_operations
+#define cl_intel_queue_no_sync_operations 1
+/* addition to cl_command_queue_properties */
+#define CL_QUEUE_NO_SYNC_OPERATIONS_INTEL                   (1 << 29)    
+* cl_intel_sharing_format_query
+#define cl_intel_sharing_format_query 1
+* cl_ext_image_requirements_info
+#ifdef CL_VERSION_3_0
+#define cl_ext_image_requirements_info 1
+typedef cl_uint cl_image_requirements_info_ext;
+#define CL_IMAGE_REQUIREMENTS_SIZE_EXT                   0x12B2
+#define CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT              0x12B3
+#define CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT             0x12B4
+#define CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT              0x12B5
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_context                     context,
+    const cl_mem_properties*       properties,
+    cl_mem_flags                   flags,
+    const cl_image_format*         image_format,
+    const cl_image_desc*           image_desc,
+    cl_image_requirements_info_ext param_name,
+    size_t  param_value_size,
+    void*   param_value,
+    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0;
+typedef cl_int (CL_API_CALL *
+    cl_context                     context,
+    const cl_mem_properties*       properties,
+    cl_mem_flags                   flags,
+    const cl_image_format*         image_format,
+    const cl_image_desc*           image_desc,
+    cl_image_requirements_info_ext param_name,
+    size_t  param_value_size,
+    void*   param_value,
+    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0;
+* cl_ext_image_from_buffer
+#ifdef CL_VERSION_3_0
+#define cl_ext_image_from_buffer 1
 #ifdef __cplusplus
diff --git a/dependencies/ocl-headers/CL/cl_gl.h b/dependencies/ocl-headers/CL/cl_gl.h
index 5ea0fd8..3277465 100644
--- a/dependencies/ocl-headers/CL/cl_gl.h
+++ b/dependencies/ocl-headers/CL/cl_gl.h
@@ -162,6 +162,31 @@
                            cl_GLsync  sync,
                            cl_int *   errcode_ret) CL_API_SUFFIX__VERSION_1_1;
+* cl_intel_sharing_format_query_gl
+#define cl_intel_sharing_format_query_gl 1
+/* when cl_khr_gl_sharing is supported */
+extern CL_API_ENTRY cl_int CL_API_CALL
+    cl_context context,
+    cl_mem_flags flags,
+    cl_mem_object_type image_type,
+    cl_uint num_entries,
+    cl_GLenum* gl_formats,
+    cl_uint* num_texture_formats) ;
+typedef cl_int (CL_API_CALL *
+    cl_context context,
+    cl_mem_flags flags,
+    cl_mem_object_type image_type,
+    cl_uint num_entries,
+    cl_GLenum* gl_formats,
+    cl_uint* num_texture_formats) ;
 #ifdef __cplusplus
diff --git a/dependencies/ocl-headers/CL/cl_gl_ext.h b/dependencies/ocl-headers/CL/cl_gl_ext.h
deleted file mode 100644
index 8ec8181..0000000
--- a/dependencies/ocl-headers/CL/cl_gl_ext.h
+++ /dev/null
@@ -1,18 +0,0 @@
- * Copyright (c) 2008-2021 The Khronos Group Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- ******************************************************************************/
-#include <CL/cl_gl.h>
-#pragma message("All OpenGL-related extensions have been moved into cl_gl.h.  Please include cl_gl.h directly.")
diff --git a/dependencies/ocl-headers/CL/cl_platform.h b/dependencies/ocl-headers/CL/cl_platform.h
index 8ae655d..e7a0d6f 100644
--- a/dependencies/ocl-headers/CL/cl_platform.h
+++ b/dependencies/ocl-headers/CL/cl_platform.h
@@ -135,6 +135,11 @@
 #if (defined (_WIN32) && defined(_MSC_VER))
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wlanguage-extension-token"
 /* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */
 /* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */
 #if defined(__clang__) || _MSC_VER >= 1600
@@ -155,6 +160,10 @@
 typedef float                   cl_float;
 typedef double                  cl_double;
+#if defined(__clang__)
+#pragma clang diagnostic pop
 /* Macro names and corresponding values defined by OpenCL */
 #define CL_CHAR_BIT         8
 #define CL_SCHAR_MAX        127
@@ -501,25 +510,26 @@
 #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
 #define  __CL_HAS_ANON_STRUCT__ 1
 #define  __CL_ANON_STRUCT__
-#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__)
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+#elif defined(__GNUC__) && ! defined(__STRICT_ANSI__)
 #define  __CL_HAS_ANON_STRUCT__ 1
 #define  __CL_ANON_STRUCT__ __extension__
-#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
-    #if _MSC_VER >= 1500
-   /* Microsoft Developer Studio 2008 supports anonymous structs, but
-    * complains by default. */
-    #define  __CL_HAS_ANON_STRUCT__ 1
-    #define  __CL_ANON_STRUCT__
-   /* Disable warning C4201: nonstandard extension used : nameless
-    * struct/union */
-    #pragma warning( push )
-    #pragma warning( disable : 4201 )
-    #endif
+#elif defined(__clang__)
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
 #define  __CL_HAS_ANON_STRUCT__ 0
 #define  __CL_ANON_STRUCT__
+#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless struct/union */
+    #pragma warning( push )
+    #pragma warning( disable : 4201 )
 /* Define alignment keys */
 #if defined( __GNUC__ ) || defined(__INTEGRITY)
     #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
@@ -1395,10 +1405,8 @@
-#if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
-    #if _MSC_VER >=1500
+#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__
     #pragma warning( pop )
-    #endif
 #endif  /* __CL_PLATFORM_H  */
diff --git a/dependencies/ocl-stubs/ b/dependencies/ocl-stubs/
index 8cc0954..8cdbc40 100644
--- a/dependencies/ocl-stubs/
+++ b/dependencies/ocl-stubs/
@@ -80,7 +80,7 @@
 def parse_api(api_signature):
   m = None
-  api_signature = re.sub('extern', '', api_signature)
+  api_signature = re.sub(r'\bextern\b', '', api_signature)
   api_signature = re.sub('CL_\w+', '', api_signature)
   m = re.match(r'\s*(.*)\s+(\w+)\((.*)\)\s*;', api_signature)
diff --git a/dependencies/ocl-stubs/stubs.cpp b/dependencies/ocl-stubs/stubs.cpp
index 2cf3700..fe9a912 100644
--- a/dependencies/ocl-stubs/stubs.cpp
+++ b/dependencies/ocl-stubs/stubs.cpp
@@ -2,7 +2,6 @@
 #include <CL/cl_gl.h>
 #include <CL/cl_egl.h>
 #include <CL/cl_ext.h>
-#include <CL/cl_gl_ext.h>
 #include <dlfcn.h>
@@ -31,4 +30,3 @@
 #include "apis.h"
 #undef CL_MACRO
diff --git a/ b/
index 6fc037c..ca39b9a 100755
--- a/
+++ b/
@@ -14,8 +14,11 @@
+cmake --version
 # Prepare toolchain if needed
-if [[ ${JOB_ARCHITECTURE} != "" ]]; then
+if [[ ${JOB_ARCHITECTURE} != "" && ${RUNNER_OS} != "Windows" ]]; then
     wget ${TOOLCHAIN_URL}
@@ -38,35 +41,67 @@
 if [[ ( ${JOB_ARCHITECTURE} == "" && ${JOB_ENABLE_GL} == "1" ) ]]; then
-    sudo apt-get update
-    sudo apt-get -y install libglu1-mesa-dev freeglut3-dev mesa-common-dev libglew-dev
-# Prepare headers
-git clone
-cd OpenCL-Headers
-ln -s CL OpenCL # For OSX builds
-cd ..
+if [[ ${JOB_ENABLE_DEBUG} == 1 ]]; then
+    BUILD_CONFIG="Debug"
+    BUILD_CONFIG="Release"
+#Vulkan Headers
+git clone
 # Get and build loader
 git clone
 cd ${TOP}/OpenCL-ICD-Loader
 mkdir build
 cd build
+cmake .. -G Ninja \
+      -DCMAKE_BUILD_TYPE=Release \
+cmake --build . -j2
+#Vulkan Loader
+cd ${TOP}
+git clone
+cd Vulkan-Loader
+mkdir build
+cd build
+python3 ../scripts/
+cmake .. -G Ninja \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DUSE_GAS=OFF \
+      -C helper.cmake ..
+cmake --build . -j2
 # Build CTS
 cd ${TOP}
 ls -l
 mkdir build
 cd build
-cmake -DCL_INCLUDE_DIR=${TOP}/OpenCL-Headers \
+if [[ ${RUNNER_OS} == "Windows" ]]; then
+cmake .. -G Ninja \
+      -DCL_INCLUDE_DIR=${TOP}/OpenCL-Headers \
       -DCL_LIB_DIR=${TOP}/OpenCL-ICD-Loader/build \
-      -DOPENCL_LIBRARIES="-lOpenCL -lpthread" \
-      ..
-make -j2
+      -DVULKAN_INCLUDE_DIR=${TOP}/Vulkan-Headers/include/ \
+      -DVULKAN_LIB_DIR=${TOP}/Vulkan-Loader/build/loader/
+cmake --build . -j3
diff --git a/scripts/android_bp_head b/scripts/android_bp_head
index c5cd394..42c3e2c 100644
--- a/scripts/android_bp_head
+++ b/scripts/android_bp_head
@@ -1,24 +1,7 @@
 package {
     default_applicable_licenses: ["external_OpenCL-CTS_license"],
-// Added automatically by a large-scale-change that took the approach of
-// 'apply every license found to every target'. While this makes sure we respect
-// every license restriction, it may not be entirely correct.
-// e.g. GPL in an MIT project might only apply to the contrib/ directory.
-// Please consider splitting the single license below into multiple licenses,
-// taking care not to lose any license_kind information, and overriding the
-// default license using the 'licenses: [...]' property on targets as needed.
-// For unused files, consider creating a 'fileGroup' with "//visibility:private"
-// to attach the license to, and including a comment whether the files may be
-// used in the current project.
-// See: http://go/android-license-faq
 license {
     name: "external_OpenCL-CTS_license",
     visibility: [":__subpackages__"],
@@ -27,9 +10,6 @@
-        "legacy_by_exception_only", // by exception only
-        "legacy_proprietary", // by exception only
-        "legacy_unencumbered",
     license_text: [
@@ -56,37 +36,19 @@
-        "-Wno-absolute-value",
-        "-Wno-asm-operand-widths",
-        "-Wno-dangling-else",
-        "-Wno-ignored-pragmas",
-        "-Wno-logical-op-parentheses",
-        "-Wno-macro-redefined",
-        "-Wno-missing-declarations",
-        "-Wno-parentheses",
-        "-Wno-parentheses-equality",
-        "-Wno-return-stack-address",
-        "-Wno-shift-negative-value",
-        "-Wno-switch",
-        "-Wno-unknown-pragmas",
-        "-Wno-unneeded-internal-declaration",
-        "-Wno-unused-function",
-        "-Wno-unused-label",
-        "-Wno-unused-variable",
-        "-Wno-writable-strings",
     static_libs: [
@@ -118,4 +80,3 @@
     export_include_dirs: [ "test_conformance/images" ],
     defaults: [ "ocl-test-defaults" ],
diff --git a/scripts/android_bp_tail b/scripts/android_bp_tail
index a073f33..c048873 100644
--- a/scripts/android_bp_tail
+++ b/scripts/android_bp_tail
@@ -4,14 +4,6 @@
     srcs: [ "scripts/" ],
     data: [ "scripts/test_opencl_cts.xml" ],
     test_config: "scripts/test_opencl_cts.xml",
-    version: {
-        py2: {
-            enabled: false,
-        },
-        py3: {
-            enabled: true
-        }
-    },
     test_options: {
         unit_test: false,
@@ -21,14 +13,4 @@
     name: "run_conformance",
     main: "test_conformance/",
     srcs: [ "test_conformance/" ],
-    version: {
-        py2: {
-            enabled: true,
-            embedded_launcher: true,
-        },
-        py3: {
-            enabled: false,
-       }
-    },
diff --git a/scripts/ b/scripts/
index cdb10db..1155a0c 100644
--- a/scripts/
+++ b/scripts/
@@ -1,6 +1,8 @@
 import json
 import os
 import re
+import shutil
+import subprocess
 from xml.dom import minidom
 from xml.etree import ElementTree
@@ -45,7 +47,8 @@
-def generate_android_bp():
+# Return value indicates whether the output should be formatted with bpfmt
+def generate_android_bp() -> bool:
   android_bp_head_path = os.path.join(SCRIPT_DIR, 'android_bp_head')
   android_bp_tail_path = os.path.join(SCRIPT_DIR, 'android_bp_tail')
@@ -61,6 +64,12 @@
     with open(android_bp_tail_path, 'r') as android_bp_tail:
+  if shutil.which('bpfmt') is not None:
+['bpfmt', '-w', 'Android.bp'])
+    return True
+  return False
 def create_subelement_with_attribs(element, tag, attribs):
   subelement = ElementTree.SubElement(element, tag)
@@ -142,12 +151,15 @@
 def main():
-  generate_android_bp()
+  android_bp_formatted = generate_android_bp()
   print("Don't forget to move -")
   print("    Android.bp -> {ANDROID_ROOT}/external/OpenCL-CTS/Android.bp")
   print("    test_opencl_cts.xml -> {ANDROID_ROOT}/external/OpenCL-CTS/scripts/test_opencl_cts.xml")
+  if not android_bp_formatted:
+    print("then run the blueprint autoformatter:")
+    print("    bpfmt -w {ANDROID_ROOT}/external/OpenCL-CTS/Android.bp")
 if __name__ == '__main__':
diff --git a/test_common/CMakeLists.txt b/test_common/CMakeLists.txt
index 2d4bc19..b050534 100644
--- a/test_common/CMakeLists.txt
+++ b/test_common/CMakeLists.txt
@@ -1,6 +1,5 @@
-    harness/threadTesting.cpp
@@ -22,4 +21,3 @@
 add_library(harness STATIC ${HARNESS_SOURCES})
diff --git a/test_common/gl/helpers.cpp b/test_common/gl/helpers.cpp
index def78d7..b9f95a9 100644
--- a/test_common/gl/helpers.cpp
+++ b/test_common/gl/helpers.cpp
@@ -1381,7 +1381,6 @@
   //calculating colors
   double color_delta = 1.0 / (total_layers * samples);
-  double color = color_delta;
   if (attachment != GL_DEPTH_ATTACHMENT && attachment != GL_DEPTH_STENCIL_ATTACHMENT) {
diff --git a/test_common/gl/setup_win32.cpp b/test_common/gl/setup_win32.cpp
index b120a36..708e681 100644
--- a/test_common/gl/setup_win32.cpp
+++ b/test_common/gl/setup_win32.cpp
@@ -13,14 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "setup.h"
 #include "testBase.h"
 #include "harness/errorHelpers.h"
-#include <GL/gl.h>
-#include <GL/glut.h>
 #include <CL/cl_ext.h>
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
diff --git a/test_common/gl/setup_x11.cpp b/test_common/gl/setup_x11.cpp
index c54ecde..abc065c 100644
--- a/test_common/gl/setup_x11.cpp
+++ b/test_common/gl/setup_x11.cpp
@@ -13,16 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "setup.h"
 #include "testBase.h"
 #include "harness/errorHelpers.h"
-#include <GL/gl.h>
-#include <GL/glut.h>
-#include <GL/glext.h>
-#include <GL/freeglut.h>
 #include <GL/glx.h>
 #include <CL/cl_ext.h>
@@ -90,10 +85,17 @@
         for (int i=0; i<(int)num_of_devices; i++) {
-            if (!is_extension_available(devices[i], "cl_khr_gl_sharing ")) {
-                log_info("Device %d of %d does not support required extension cl_khr_gl_sharing.\n", i+1, num_of_devices);
-            } else {
-                log_info("Device %d of %d supports required extension cl_khr_gl_sharing.\n", i+1, num_of_devices);
+            if (!is_extension_available(devices[i], "cl_khr_gl_sharing"))
+            {
+                log_info("Device %d of %d does not support required extension "
+                         "cl_khr_gl_sharing.\n",
+                         i + 1, num_of_devices);
+            }
+            else
+            {
+                log_info("Device %d of %d supports required extension "
+                         "cl_khr_gl_sharing.\n",
+                         i + 1, num_of_devices);
                 found_valid_device = 1;
                 m_devices[m_device_count++] = devices[i];
diff --git a/test_common/gles/helpers.cpp b/test_common/gles/helpers.cpp
index 34f40b4..57a4ddc 100644
--- a/test_common/gles/helpers.cpp
+++ b/test_common/gles/helpers.cpp
@@ -22,7 +22,7 @@
     {GLint __error = glGetError(); if(__error) {log_error( "GL ERROR: %s!\n", gluErrorString( err ));}}
 #if defined(__linux__) || defined(GL_ES_VERSION_2_0)
-// On linux we dont link to GLU library to avoid comaptibility issues with
+// On linux we don't link to GLU library to avoid compatibility issues with
 // libstdc++
 // FIXME: Implement this
 const GLubyte* gluErrorString (GLenum error)
@@ -271,8 +271,6 @@
     // Read results from the GL texture
     glBindTexture(get_base_gl_target(glTarget), glTexture);
-    GLint realWidth, realHeight;
-    GLint realInternalFormat;
     GLenum readBackFormat = GL_RGBA;
     GLenum readBackType = glType;
     glFramebufferWrapper glFramebuffer;
@@ -301,7 +299,7 @@
-    DumpGLBuffer(readBackType, realWidth, realHeight, (void*)outBuffer);
+    DumpGLBuffer(readBackType, outWidth, outHeight, (void *)outBuffer);
diff --git a/test_common/gles/helpers.h b/test_common/gles/helpers.h
index 5bd0fdf..2076878 100644
--- a/test_common/gles/helpers.h
+++ b/test_common/gles/helpers.h
@@ -30,11 +30,10 @@
 #if !defined (__APPLE__)
 #include <CL/cl.h>
-#include "gl_headers.h"
 #include <CL/cl_gl.h>
-#include "gl_headers.h"
+#include <CL/cl_half.h>
+#include "gl_headers.h"
 #include "harness/errorHelpers.h"
 #include "harness/kernelHelpers.h"
diff --git a/test_common/harness/ThreadPool.cpp b/test_common/harness/ThreadPool.cpp
index 31985aa..6279804 100644
--- a/test_common/harness/ThreadPool.cpp
+++ b/test_common/harness/ThreadPool.cpp
@@ -22,6 +22,8 @@
 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
 // or any other POSIX system
+#include <atomic>
 #if defined(_WIN32)
 #include <windows.h>
 #if defined(_MSC_VER)
@@ -241,7 +243,7 @@
 // Condition variable state. How many iterations on the function left to run,
 // set to CL_INT_MAX to cause worker threads to exit. Note: this value might
 // go negative.
-volatile cl_int gRunCount = 0;
+std::atomic<cl_int> gRunCount{ 0 };
 // State that only changes when the threadpool is not working.
 volatile TPFuncPtr gFunc_ptr = NULL;
@@ -261,19 +263,20 @@
 // # of threads intended to be running. Running threads will decrement this
 // as they discover they've run out of work to do.
-volatile cl_int gRunning = 0;
+std::atomic<cl_int> gRunning{ 0 };
 // The total number of threads launched.
-volatile cl_int gThreadCount = 0;
+std::atomic<cl_int> gThreadCount{ 0 };
 #ifdef _WIN32
 void ThreadPool_WorkerFunc(void *p)
 void *ThreadPool_WorkerFunc(void *p)
-    cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1);
-    cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1);
-    // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );
+    auto &tid = *static_cast<std::atomic<cl_uint> *>(p);
+    cl_uint threadID = tid++;
+    cl_int item = gRunCount--;
     while (MAX_COUNT > item)
@@ -282,8 +285,6 @@
         // check for more work to do
         if (0 >= item)
-            // log_info("Thread %d has run out of work.\n", threadID);
             // No work to do. Attempt to block waiting for work
 #if defined(_WIN32)
@@ -298,9 +299,7 @@
 #endif // !_WIN32
-            cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1);
-            // log_info("ThreadPool_WorkerFunc: gRunning = %d\n",
-            //          remaining - 1);
+            cl_int remaining = gRunning--;
             if (1 == remaining)
             { // last thread out signal the main thread to wake up
 #if defined(_WIN32)
@@ -350,7 +349,7 @@
 #endif // !_WIN32
                 // try again to get a valid item id
-                item = ThreadPool_AtomicAdd(&gRunCount, -1);
+                item = gRunCount--;
                 if (MAX_COUNT <= item) // exit if we are done
 #if defined(_WIN32)
@@ -362,8 +361,7 @@
-            ThreadPool_AtomicAdd(&gRunning, 1);
-            // log_info("Thread %d has found work.\n", threadID);
+            gRunning++;
 #if defined(_WIN32)
@@ -447,12 +445,12 @@
         // get the next item
-        item = ThreadPool_AtomicAdd(&gRunCount, -1);
+        item = gRunCount--;
     log_info("ThreadPool: thread %d exiting.\n", threadID);
-    ThreadPool_AtomicAdd(&gThreadCount, -1);
+    gThreadCount--;
 #if !defined(_WIN32)
     return NULL;
@@ -487,7 +485,7 @@
     cl_int i;
     int err;
-    volatile cl_uint threadID = 0;
+    std::atomic<cl_uint> threadID{ 0 };
     // Check for manual override of multithreading code. We add this for better
     // debuggability.
@@ -523,7 +521,7 @@
                         // Count the number of bits in ProcessorMask (number of
                         // logical cores)
-                        ULONG mask = ptr->ProcessorMask;
+                        ULONG_PTR mask = ptr->ProcessorMask;
                         while (mask)
@@ -624,7 +622,7 @@
 #endif // !_WIN32
-    gRunning = gThreadCount;
+    gRunning = gThreadCount.load();
     // init threads
     for (i = 0; i < gThreadCount; i++)
@@ -688,7 +686,6 @@
 void ThreadPool_Exit(void)
-    int err, count;
     gRunCount = CL_INT_MAX;
 #if defined(__GNUC__)
@@ -702,13 +699,13 @@
     // spin waiting for threads to die
-    for (count = 0; 0 != gThreadCount && count < 1000; count++)
+    for (int count = 0; 0 != gThreadCount && count < 1000; count++)
 #if defined(_WIN32)
 #else // !_WIN32
-        if ((err = pthread_cond_broadcast(&cond_var)))
+        if (int err = pthread_cond_broadcast(&cond_var))
             log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
                       "work threads. ThreadPool_Exit failed.\n",
@@ -722,7 +719,7 @@
     if (gThreadCount)
         log_error("Error: Thread pool timed out after 1 second with %d threads "
                   "still active.\n",
-                  gThreadCount);
+                  gThreadCount.load());
         log_info("Thread pool exited in a orderly fashion.\n");
@@ -738,7 +735,9 @@
 // all available then it would make more sense to use those features.
 cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
+#ifndef _WIN32
     cl_int newErr;
     cl_int err = 0;
     // Lazily set up our threads
 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
@@ -913,7 +912,9 @@
     err = jobError;
+#ifndef _WIN32
     // exit critical region
 #if defined(_WIN32)
diff --git a/test_common/harness/alloc.h b/test_common/harness/alloc.h
index 653dde0..3b00d7c 100644
--- a/test_common/harness/alloc.h
+++ b/test_common/harness/alloc.h
@@ -29,7 +29,7 @@
 #include "mingw_compat.h"
-static void* align_malloc(size_t size, size_t alignment)
+inline void* align_malloc(size_t size, size_t alignment)
 #if defined(_WIN32) && defined(_MSC_VER)
     return _aligned_malloc(size, alignment);
@@ -53,7 +53,7 @@
-static void align_free(void* ptr)
+inline void align_free(void* ptr)
 #if defined(_WIN32) && defined(_MSC_VER)
diff --git a/test_common/harness/compat.h b/test_common/harness/compat.h
index 7aad15a..4053b7e 100644
--- a/test_common/harness/compat.h
+++ b/test_common/harness/compat.h
@@ -18,13 +18,13 @@
 #if defined(_WIN32) && defined(_MSC_VER)
 #include <Windows.h>
 #ifdef __cplusplus
 #define EXTERN_C extern "C"
 #define EXTERN_C
@@ -309,13 +309,6 @@
-#ifndef MIN
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#ifndef MAX
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
diff --git a/test_common/harness/conversions.cpp b/test_common/harness/conversions.cpp
index fc3317c..d52a2ac 100644
--- a/test_common/harness/conversions.cpp
+++ b/test_common/harness/conversions.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "conversions.h"
+#include <cinttypes>
 #include <limits.h>
 #include <time.h>
 #include <assert.h>
@@ -50,10 +51,10 @@
         case kInt: sprintf(string, "%d", *((cl_int *)data)); return;
         case kUInt:
         case kUnsignedInt: sprintf(string, "%u", *((cl_uint *)data)); return;
-        case kLong: sprintf(string, "%lld", *((cl_long *)data)); return;
+        case kLong: sprintf(string, "%" PRId64 "", *((cl_long *)data)); return;
         case kULong:
         case kUnsignedLong:
-            sprintf(string, "%llu", *((cl_ulong *)data));
+            sprintf(string, "%" PRIu64 "", *((cl_ulong *)data));
         case kFloat: sprintf(string, "%f", *((cl_float *)data)); return;
         case kHalf: sprintf(string, "half"); return;
@@ -181,8 +182,8 @@
-    0xffffffffffffffffLL,
-    0xffffffffffffffffLL,
+    0xffffffffffffffffULL,
+    0xffffffffffffffffULL,
 }; // Last two values aren't stored here
diff --git a/test_common/harness/deviceInfo.cpp b/test_common/harness/deviceInfo.cpp
index 287a142..97ab8c8 100644
--- a/test_common/harness/deviceInfo.cpp
+++ b/test_common/harness/deviceInfo.cpp
@@ -63,6 +63,40 @@
     return false;
+cl_version get_extension_version(cl_device_id device, const char *extensionName)
+    cl_int err;
+    size_t size;
+    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS_WITH_VERSION, 0, nullptr,
+                          &size);
+    if (err != CL_SUCCESS)
+    {
+        throw std::runtime_error("clGetDeviceInfo(CL_DEVICE_EXTENSIONS_WITH_"
+                                 "VERSION) failed to return size\n");
+    }
+    std::vector<cl_name_version> extensions(size / sizeof(cl_name_version));
+    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS_WITH_VERSION, size,
+                          extensions.data(), &size);
+    if (err != CL_SUCCESS)
+    {
+        throw std::runtime_error("clGetDeviceInfo(CL_DEVICE_EXTENSIONS_WITH_"
+                                 "VERSION) failed to return value\n");
+    }
+    for (auto &ext : extensions)
+    {
+        if (!strcmp(extensionName, ext.name))
+        {
+            return ext.version;
+        }
+    }
+    throw std::runtime_error("Extension " + std::string(extensionName)
+                             + " not supported by device!");
 /* Returns a string containing the supported extensions list for a device. */
 std::string get_device_extensions_string(cl_device_id device)
diff --git a/test_common/harness/deviceInfo.h b/test_common/harness/deviceInfo.h
index f8c5580..912dd19 100644
--- a/test_common/harness/deviceInfo.h
+++ b/test_common/harness/deviceInfo.h
@@ -31,6 +31,11 @@
 /* Determines if an extension is supported by a device. */
 int is_extension_available(cl_device_id device, const char *extensionName);
+/* Returns the version of the extension the device supports or throws an
+ * exception if the extension is not supported by the device. */
+cl_version get_extension_version(cl_device_id device,
+                                 const char *extensionName);
 /* Returns a string containing the supported extensions list for a device. */
 std::string get_device_extensions_string(cl_device_id device);
diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index 22a2677..eaccf64 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -18,9 +18,12 @@
 #include <stdlib.h>
 #include <string.h>
+#include <algorithm>
 #include "errorHelpers.h"
 #include "parseParameters.h"
+#include "testHarness.h"
 #include <CL/cl_half.h>
@@ -300,10 +303,6 @@
-#ifndef MAX
-#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b))
 #if defined(_MSC_VER)
 #define scalbnf(_a, _i) ldexpf(_a, _i)
 #define scalbn(_a, _i) ldexp(_a, _i)
@@ -356,7 +355,7 @@
         // The unbiased exponent of the ulp unit place
         int ulp_exp =
-            HALF_MANT_DIG - 1 - MAX(ilogb(reference), HALF_MIN_EXP - 1);
+            HALF_MANT_DIG - 1 - std::max(ilogb(reference), HALF_MIN_EXP - 1);
         // Scale the exponent of the error
         return (float)scalbn(testVal - reference, ulp_exp);
@@ -364,7 +363,7 @@
     // reference is a normal power of two or a zero
     int ulp_exp =
-        HALF_MANT_DIG - 1 - MAX(ilogb(reference) - 1, HALF_MIN_EXP - 1);
+        HALF_MANT_DIG - 1 - std::max(ilogb(reference) - 1, HALF_MIN_EXP - 1);
     // Scale the exponent of the error
     return (float)scalbn(testVal - reference, ulp_exp);
@@ -436,7 +435,8 @@
             return 0.0f; // if we are expecting a NaN, any NaN is fine
         // The unbiased exponent of the ulp unit place
-        int ulp_exp = FLT_MANT_DIG - 1 - MAX(ilogb(reference), FLT_MIN_EXP - 1);
+        int ulp_exp =
+            FLT_MANT_DIG - 1 - std::max(ilogb(reference), FLT_MIN_EXP - 1);
         // Scale the exponent of the error
         return (float)scalbn(testVal - reference, ulp_exp);
@@ -444,7 +444,8 @@
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
-    int ulp_exp = FLT_MANT_DIG - 1 - MAX(ilogb(reference) - 1, FLT_MIN_EXP - 1);
+    int ulp_exp =
+        FLT_MANT_DIG - 1 - std::max(ilogb(reference) - 1, FLT_MIN_EXP - 1);
     // Scale the exponent of the error
     return (float)scalbn(testVal - reference, ulp_exp);
@@ -512,7 +513,7 @@
         // The unbiased exponent of the ulp unit place
         int ulp_exp =
-            DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1);
+            DBL_MANT_DIG - 1 - std::max(ilogbl(reference), DBL_MIN_EXP - 1);
         // Scale the exponent of the error
         float result = (float)scalbnl(testVal - reference, ulp_exp);
@@ -528,7 +529,7 @@
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
     int ulp_exp =
-        DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
+        DBL_MANT_DIG - 1 - std::max(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
     // Scale the exponent of the error
     float result = (float)scalbnl(testVal - reference, ulp_exp);
@@ -564,7 +565,7 @@
             error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
             test_error(error, "Unable to query context's device size");
-            num_devices = size_ret / sizeof(cl_device_id);
+            num_devices = static_cast<cl_uint>(size_ret / sizeof(cl_device_id));
             device_list = (cl_device_id *)malloc(size_ret);
             if (device_list == NULL)
@@ -690,21 +691,19 @@
-int check_functions_for_offline_compiler(const char *subtestname,
-                                         cl_device_id device)
+bool check_functions_for_offline_compiler(const char *subtestname)
     if (gCompilationMode != kOnline)
-        int nNotRequiredWithOfflineCompiler =
-            sizeof(subtests_to_skip_with_offline_compiler) / sizeof(char *);
-        size_t i;
-        for (i = 0; i < nNotRequiredWithOfflineCompiler; ++i)
+        size_t nNotRequiredWithOfflineCompiler =
+            ARRAY_SIZE(subtests_to_skip_with_offline_compiler);
+        for (size_t i = 0; i < nNotRequiredWithOfflineCompiler; ++i)
             if (!strcmp(subtestname, subtests_to_skip_with_offline_compiler[i]))
-                return 1;
+                return false;
-    return 0;
+    return true;
diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h
index 1944601..80eb3b5 100644
--- a/test_common/harness/errorHelpers.h
+++ b/test_common/harness/errorHelpers.h
@@ -56,17 +56,13 @@
 #define vlog printf
-#define ct_assert(b) ct_assert_i(b, __LINE__)
-#define ct_assert_i(b, line) ct_assert_ii(b, line)
-#define ct_assert_ii(b, line)                                                  \
-    int _compile_time_assertion_on_line_##line[b ? 1 : -1];
 #define test_fail(msg, ...)                                                    \
     {                                                                          \
         log_error(msg, ##__VA_ARGS__);                                         \
         return TEST_FAIL;                                                      \
 #define test_error(errCode, msg) test_error_ret(errCode, msg, errCode)
+#define test_error_fail(errCode, msg) test_error_ret(errCode, msg, TEST_FAIL)
 #define test_error_ret(errCode, msg, retValue)                                 \
     {                                                                          \
         auto errCodeResult = errCode;                                          \
@@ -97,21 +93,6 @@
                         "the device version! (from %s:%d)\n",                  \
                         msg, __FILE__, __LINE__);
-#define test_missing_support_offline_cmpiler(errCode, msg)                     \
-    test_missing_support_offline_cmpiler_ret(errCode, msg, errCode)
-// this macro should always return CL_SUCCESS, but print the skip message on
-// test not supported with offline compiler
-#define test_missing_support_offline_cmpiler_ret(errCode, msg, retValue)       \
-    {                                                                          \
-        if (errCode != CL_SUCCESS)                                             \
-        {                                                                      \
-            log_info("INFO: Subtest %s tests is not supported in offline "     \
-                     "compiler execution path! (from %s:%d)\n",                \
-                     msg, __FILE__, __LINE__);                                 \
-            return TEST_SKIP;                                                  \
-        }                                                                      \
-    }
 // expected error code vs. what we got
 #define test_failure_error(errCode, expectedErrCode, msg)                      \
     test_failure_error_ret(errCode, expectedErrCode, msg,                      \
@@ -186,8 +167,7 @@
 extern const char *GetQueuePropertyName(cl_command_queue_properties properties);
 extern const char *GetDeviceTypeName(cl_device_type type);
-int check_functions_for_offline_compiler(const char *subtestname,
-                                         cl_device_id device);
+bool check_functions_for_offline_compiler(const char *subtestname);
 cl_int OutputBuildLogs(cl_program program, cl_uint num_devices,
                        cl_device_id *device_list);
diff --git a/test_common/harness/fpcontrol.h b/test_common/harness/fpcontrol.h
index 40826c5..222aa2c 100644
--- a/test_common/harness/fpcontrol.h
+++ b/test_common/harness/fpcontrol.h
@@ -16,6 +16,8 @@
 #ifndef _fpcontrol_h
 #define _fpcontrol_h
+#include <cstdint>
 // In order to get tests for correctly rounded operations (e.g. multiply) to
 // work properly we need to be able to set the reference hardware to FTZ mode if
 // the device hardware is running in that mode.  We have explored all other
@@ -30,7 +32,11 @@
 // that rounding mode.
 #if defined(__APPLE__) || defined(_MSC_VER) || defined(__linux__)              \
     || defined(__MINGW32__)
+#ifdef _MSC_VER
 typedef int FPU_mode_type;
+typedef int64_t FPU_mode_type;
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
 #include <xmmintrin.h>
@@ -39,7 +45,7 @@
 extern __thread fpu_control_t fpu_control;
 // Set the reference hardware floating point unit to FTZ mode
-static inline void ForceFTZ(FPU_mode_type *mode)
+inline void ForceFTZ(FPU_mode_type *mode)
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
@@ -55,7 +61,7 @@
     __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr | (1U << 24)));
     // Add 64 bit support
 #elif defined(__aarch64__)
-    unsigned fpscr;
+    uint64_t fpscr;
     __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr));
     *mode = fpscr;
     __asm__ volatile("msr fpcr, %0" ::"r"(fpscr | (1U << 24)));
@@ -65,7 +71,7 @@
 // Disable the denorm flush to zero
-static inline void DisableFTZ(FPU_mode_type *mode)
+inline void DisableFTZ(FPU_mode_type *mode)
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
@@ -81,7 +87,7 @@
     __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr & ~(1U << 24)));
     // Add 64 bit support
 #elif defined(__aarch64__)
-    unsigned fpscr;
+    uint64_t fpscr;
     __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr));
     *mode = fpscr;
     __asm__ volatile("msr fpcr, %0" ::"r"(fpscr & ~(1U << 24)));
@@ -91,7 +97,7 @@
 // Restore the reference hardware to floating point state indicated by *mode
-static inline void RestoreFPState(FPU_mode_type *mode)
+inline void RestoreFPState(FPU_mode_type *mode)
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index 72a2f0c..f1694e8 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -23,6 +23,7 @@
 #include <malloc.h>
 #include <algorithm>
+#include <cinttypes>
 #include <iterator>
 #if !defined(_WIN32)
 #include <cmath>
@@ -421,7 +422,7 @@
               (int)thirdDim, (int)imageInfo->rowPitch,
                   - (int)imageInfo->width * (int)pixel_size);
-    log_error("Failed at column: %ld   ", where);
+    log_error("Failed at column: %zu   ", where);
     switch (pixel_size)
@@ -454,7 +455,7 @@
                 ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]);
         case 8:
-            log_error("*0x%16.16llx vs. 0x%16.16llx\n",
+            log_error("*0x%16.16" PRIx64 " vs. 0x%16.16" PRIx64 "\n",
                       ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]);
         case 12:
@@ -473,12 +474,53 @@
                       ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]);
-            log_error("Don't know how to print pixel size of %ld\n",
+            log_error("Don't know how to print pixel size of %zu\n",
+size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr,
+                         const char *bPtr)
+    size_t pixel_size = get_pixel_size(imageInfo->format);
+    size_t column;
+    for (column = 0; column < imageInfo->width; column++)
+    {
+        switch (imageInfo->format->image_channel_data_type)
+        {
+            // If the data type is 101010, then ignore bits 30 and 31 when
+            // comparing the row
+            case CL_UNORM_INT_101010: {
+                cl_uint aPixel = *(cl_uint *)aPtr;
+                cl_uint bPixel = *(cl_uint *)bPtr;
+                if ((aPixel & 0x3fffffff) != (bPixel & 0x3fffffff))
+                    return column;
+            }
+            break;
+            // If the data type is 555, ignore bit 15 when comparing the row
+            case CL_UNORM_SHORT_555: {
+                cl_ushort aPixel = *(cl_ushort *)aPtr;
+                cl_ushort bPixel = *(cl_ushort *)bPtr;
+                if ((aPixel & 0x7fff) != (bPixel & 0x7fff)) return column;
+            }
+            break;
+            default:
+                if (memcmp(aPtr, bPtr, pixel_size) != 0) return column;
+                break;
+        }
+        aPtr += pixel_size;
+        bPtr += pixel_size;
+    }
+    // If we didn't find a difference, return the width of the image
+    return column;
 int random_log_in_range(int minV, int maxV, MTdata d)
     double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1);
@@ -554,8 +596,8 @@
-        ct_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2);
+        static_assert(CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6, "");
+        static_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2, "");
               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn;
@@ -649,9 +691,6 @@
         _b ^= _a;                                                              \
         _a ^= _b;                                                              \
     } while (0)
-#ifndef MAX
-#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b))
 void get_max_sizes(
     size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3],
@@ -719,7 +758,7 @@
     if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16;
     size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;
-    log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, "
+    log_info("Maximums: [%zu x %zu x %zu], raw pixel size %zu bytes, "
              "per-allocation limit %gMB.\n",
              maxWidth, maxHeight, isArray ? maxArraySize : maxDepth,
              raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0)));
@@ -760,10 +799,10 @@
     if (image_type == CL_MEM_OBJECT_IMAGE1D)
-        double M = maximum_sizes[0];
+        size_t M = maximum_sizes[0];
         // Store the size
-        sizes[(*numberOfSizes)][0] = (size_t)M;
+        sizes[(*numberOfSizes)][0] = M;
         sizes[(*numberOfSizes)][1] = 1;
         sizes[(*numberOfSizes)][2] = 1;
@@ -777,17 +816,17 @@
             // Determine the size of the fixed dimension
-            double M = maximum_sizes[fixed_dim];
-            double A = max_pixels;
+            size_t M = maximum_sizes[fixed_dim];
+            size_t A = max_pixels;
             int x0_dim = !fixed_dim;
-            double x0 =
+            size_t x0 = static_cast<size_t>(
                 fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M),
-                     maximum_sizes[x0_dim]);
+                     maximum_sizes[x0_dim]));
             // Store the size
-            sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
-            sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
+            sizes[(*numberOfSizes)][fixed_dim] = M;
+            sizes[(*numberOfSizes)][x0_dim] = x0;
             sizes[(*numberOfSizes)][2] = 1;
@@ -802,16 +841,17 @@
             // Determine the size of the fixed dimension
-            double M = maximum_sizes[fixed_dim];
-            double A = max_pixels;
+            size_t M = maximum_sizes[fixed_dim];
+            size_t A = max_pixels;
             // Find two other dimensions, x0 and x1
             int x0_dim = (fixed_dim == 0) ? 1 : 0;
             int x1_dim = (fixed_dim == 2) ? 1 : 2;
             // Choose two other sizes for these dimensions
-            double x0 = fmin(fmin(A / M, maximum_sizes[x0_dim]),
-                             other_sizes[(other_size++) % num_other_sizes]);
+            size_t x0 = static_cast<size_t>(
+                fmin(fmin(A / M, maximum_sizes[x0_dim]),
+                     other_sizes[(other_size++) % num_other_sizes]));
             // GPUs have certain restrictions on minimum width (row alignment)
             // of images which has given us issues testing small widths in this
             // test (say we set width to 3 for testing, and compute size based
@@ -820,8 +860,9 @@
             // width of 16 which doesnt fit in vram). For this purpose we are
             // not testing width < 16 for this test.
             if (x0_dim == 0 && x0 < 16) x0 = 16;
-            double x1 = fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
-                             other_sizes[(other_size++) % num_other_sizes]);
+            size_t x1 = static_cast<size_t>(
+                fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
+                     other_sizes[(other_size++) % num_other_sizes]));
             // Valid image sizes cannot be below 1. Due to the workaround for
             // the xo_dim where x0 is overidden to 16 there might not be enough
@@ -834,9 +875,9 @@
             assert(x0 > 0 && M > 0);
             // Store the size
-            sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
-            sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
-            sizes[(*numberOfSizes)][x1_dim] = (size_t)x1;
+            sizes[(*numberOfSizes)][fixed_dim] = M;
+            sizes[(*numberOfSizes)][x0_dim] = x0;
+            sizes[(*numberOfSizes)][x1_dim] = x1;
@@ -847,20 +888,20 @@
         switch (image_type)
             case CL_MEM_OBJECT_IMAGE1D:
-                log_info(" size[%d] = [%ld] (%g MB image)\n", j, sizes[j][0],
+                log_info(" size[%d] = [%zu] (%g MB image)\n", j, sizes[j][0],
                          raw_pixel_size * sizes[j][0] * sizes[j][1]
                              * sizes[j][2] / (1024.0 * 1024.0));
             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
             case CL_MEM_OBJECT_IMAGE2D:
-                log_info(" size[%d] = [%ld %ld] (%g MB image)\n", j,
+                log_info(" size[%d] = [%zu %zu] (%g MB image)\n", j,
                          sizes[j][0], sizes[j][1],
                          raw_pixel_size * sizes[j][0] * sizes[j][1]
                              * sizes[j][2] / (1024.0 * 1024.0));
             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
             case CL_MEM_OBJECT_IMAGE3D:
-                log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n", j,
+                log_info(" size[%d] = [%zu %zu %zu] (%g MB image)\n", j,
                          sizes[j][0], sizes[j][1], sizes[j][2],
                          raw_pixel_size * sizes[j][0] * sizes[j][1]
                              * sizes[j][2] / (1024.0 * 1024.0));
@@ -884,6 +925,8 @@
         case CL_SFIXED14_APPLE: return 0x1.0p-14f;
+        case CL_UNORM_SHORT_555:
+        case CL_UNORM_SHORT_565: return 1.0f / 31.0f;
         default: return 0.0f;
@@ -1124,12 +1167,13 @@
 char *generate_random_image_data(image_descriptor *imageInfo,
                                  BufferOwningPtr<char> &P, MTdata d)
-    size_t allocSize = get_image_size(imageInfo);
+    size_t allocSize = static_cast<size_t>(get_image_size(imageInfo));
     size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format);
     size_t i;
     if (imageInfo->num_mip_levels > 1)
-        allocSize = compute_mipmapped_image_size(*imageInfo);
+        allocSize =
+            static_cast<size_t>(compute_mipmapped_image_size(*imageInfo));
 #if defined(__APPLE__)
     char *data = NULL;
@@ -1161,7 +1205,7 @@
     if (data == NULL)
-        log_error("ERROR: Unable to malloc %lu bytes for "
+        log_error("ERROR: Unable to malloc %zu bytes for "
         return 0;
@@ -1678,24 +1722,26 @@
     // At this point, we're dealing with non-normalized coordinates.
-    outX = adFn(floorf(x), width);
+    outX = adFn(static_cast<int>(floorf(x)), width);
     // 1D and 2D arrays require special care for the index coordinate:
     switch (imageInfo->type)
-            outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f);
-            outZ = 0.0f; /* don't care! */
+            outY = static_cast<int>(
+                calculate_array_index(y, (float)imageInfo->arraySize - 1.0f));
+            outZ = 0; /* don't care! */
-            outY = adFn(floorf(y), height);
-            outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f);
+            outY = adFn(static_cast<int>(floorf(y)), height);
+            outZ = static_cast<int>(
+                calculate_array_index(z, (float)imageInfo->arraySize - 1.0f));
             // legacy path:
-            if (height != 0) outY = adFn(floorf(y), height);
-            if (depth != 0) outZ = adFn(floorf(z), depth);
+            if (height != 0) outY = adFn(static_cast<int>(floorf(y)), height);
+            if (depth != 0) outZ = adFn(static_cast<int>(floorf(z)), depth);
     return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ);
@@ -1766,7 +1812,7 @@
     switch (addressing_mode)
         case CL_ADDRESS_REPEAT:
-            ret = RepeatNormalizedAddressFn(coord, extent);
+            ret = RepeatNormalizedAddressFn(coord, static_cast<size_t>(extent));
             if (verbose)
@@ -1790,7 +1836,8 @@
-            ret = MirroredRepeatNormalizedAddressFn(coord, extent);
+            ret = MirroredRepeatNormalizedAddressFn(
+                coord, static_cast<size_t>(extent));
             if (verbose)
@@ -1948,7 +1995,7 @@
             case CL_MEM_OBJECT_IMAGE1D:
             case CL_MEM_OBJECT_IMAGE1D_BUFFER:
-                log_info("Starting coordinate: %f\b", x);
+                log_info("Starting coordinate: %f\n", x);
             case CL_MEM_OBJECT_IMAGE2D:
                 log_info("Starting coordinate: %f, %f\n", x, y);
@@ -1968,13 +2015,13 @@
         // coordinates.  Note that the array cases again require special
         // care, per section 8.4 in the OpenCL 1.2 Specification.
-        ix = adFn(floorf(x), width_lod);
+        ix = adFn(static_cast<int>(floorf(x)), width_lod);
         switch (imageInfo->type)
             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-                iy =
-                    calculate_array_index(y, (float)(imageInfo->arraySize - 1));
+                iy = static_cast<int>(calculate_array_index(
+                    y, (float)(imageInfo->arraySize - 1)));
                 iz = 0;
                 if (verbose)
@@ -1982,18 +2029,18 @@
             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-                iy = adFn(floorf(y), height_lod);
-                iz =
-                    calculate_array_index(z, (float)(imageInfo->arraySize - 1));
+                iy = adFn(static_cast<int>(floorf(y)), height_lod);
+                iz = static_cast<int>(calculate_array_index(
+                    z, (float)(imageInfo->arraySize - 1)));
                 if (verbose)
                     log_info("\tArray index %f evaluates to %d\n", z, iz);
-                iy = adFn(floorf(y), height_lod);
+                iy = adFn(static_cast<int>(floorf(y)), height_lod);
                 if (depth_lod != 0)
-                    iz = adFn(floorf(z), depth_lod);
+                    iz = adFn(static_cast<int>(floorf(z)), depth_lod);
                     iz = 0;
@@ -2047,16 +2094,16 @@
                 height = 1;
-            int x1 = adFn(floorf(x - 0.5f), width);
+            int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width);
             int y1 = 0;
-            int x2 = adFn(floorf(x - 0.5f) + 1, width);
+            int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width);
             int y2 = 0;
             if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D)
                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER))
-                y1 = adFn(floorf(y - 0.5f), height);
-                y2 = adFn(floorf(y - 0.5f) + 1, height);
+                y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height);
+                y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height);
@@ -2147,12 +2194,12 @@
             // 3D linear filtering
-            int x1 = adFn(floorf(x - 0.5f), width_lod);
-            int y1 = adFn(floorf(y - 0.5f), height_lod);
-            int z1 = adFn(floorf(z - 0.5f), depth_lod);
-            int x2 = adFn(floorf(x - 0.5f) + 1, width_lod);
-            int y2 = adFn(floorf(y - 0.5f) + 1, height_lod);
-            int z2 = adFn(floorf(z - 0.5f) + 1, depth_lod);
+            int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width_lod);
+            int y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height_lod);
+            int z1 = adFn(static_cast<int>(floorf(z - 0.5f)), depth_lod);
+            int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width_lod);
+            int y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height_lod);
+            int z2 = adFn(static_cast<int>(floorf(z - 0.5f) + 1), depth_lod);
             if (verbose)
                 log_info("\tActual integer coords used (i = floor(x-.5)): "
@@ -2580,11 +2627,11 @@
-int round_to_even(float v)
+cl_int round_to_even(float v)
     // clamp overflow
-    if (v >= -(float)INT_MIN) return INT_MAX;
-    if (v <= (float)INT_MIN) return INT_MIN;
+    if (v >= -(float)CL_INT_MIN) return CL_INT_MAX;
+    if (v <= (float)CL_INT_MIN) return CL_INT_MIN;
     // round fractional values to integer value
     if (fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23))
@@ -2596,7 +2643,7 @@
         v -= magicVal;
-    return (int)v;
+    return (cl_int)v;
 void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat,
@@ -2721,10 +2768,7 @@
         case CL_SIGNED_INT32: {
             cl_int *ptr = (cl_int *)outData;
             for (unsigned int i = 0; i < channelCount; i++)
-                ptr[i] = (int)CONVERT_INT(
-                    srcVector[i], MAKE_HEX_FLOAT(-0x1.0p31f, -1, 31),
-                    MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffe, 30 - 23),
-                    CL_INT_MAX);
+                ptr[i] = round_to_even(srcVector[i]);
         case CL_UNSIGNED_INT8: {
@@ -2888,26 +2932,25 @@
         case CL_SIGNED_INT32: {
             const cl_int *ptr = (const cl_int *)results;
             for (unsigned int i = 0; i < channelCount; i++)
-                errors[i] = (cl_float)(
-                    (cl_long)ptr[i]
-                    - (cl_long)CONVERT_INT(
-                        srcVector[i], MAKE_HEX_FLOAT(-0x1.0p31f, -1, 31),
-                        MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffe, 30 - 23),
-                        CL_INT_MAX));
+                errors[i] = (cl_float)((cl_long)ptr[i]
+                                       - (cl_long)round_to_even(srcVector[i]));
         case CL_UNSIGNED_INT8: {
             const cl_uchar *ptr = (const cl_uchar *)results;
             for (unsigned int i = 0; i < channelCount; i++)
-                errors[i] = (cl_int)ptr[i]
-                    - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX);
+                errors[i] = static_cast<float>(
+                    (cl_int)ptr[i]
+                    - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX));
         case CL_UNSIGNED_INT16: {
             const cl_ushort *ptr = (const cl_ushort *)results;
             for (unsigned int i = 0; i < channelCount; i++)
-                errors[i] = (cl_int)ptr[i]
-                    - (cl_int)CONVERT_UINT(srcVector[i], 32767.f, CL_USHRT_MAX);
+                errors[i] = static_cast<float>(
+                    (cl_int)ptr[i]
+                    - (cl_int)CONVERT_UINT(srcVector[i], 32767.f,
+                                           CL_USHRT_MAX));
         case CL_UNSIGNED_INT32: {
@@ -3228,7 +3271,7 @@
     if (data == NULL)
-            "ERROR: Unable to malloc %lu bytes for create_random_image_data\n",
+            "ERROR: Unable to malloc %zu bytes for create_random_image_data\n",
         return NULL;
@@ -3988,7 +4031,8 @@
 cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth)
-    cl_uint retMaxMipLevels = 0, max_dim = 0;
+    cl_uint retMaxMipLevels = 0;
+    size_t max_dim = 0;
     max_dim = width;
     max_dim = height > max_dim ? height : max_dim;
diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index 848ec65..f8ae4fb 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -63,7 +63,7 @@
     bool normalized_coords;
 } image_sampler_data;
-int round_to_even(float v);
+cl_int round_to_even(float v);
 #define NORMALIZE(v, max) (v < 0 ? 0 : (v > 1.f ? max : round_to_even(v * max)))
 #define NORMALIZE_UNROUNDED(v, max) (v < 0 ? 0 : (v > 1.f ? max : v * max))
@@ -139,6 +139,9 @@
                                         image_descriptor *imageInfo, size_t y,
                                         size_t thirdDim);
+size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr,
+                         const char *bPtr);
 void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
                    size_t sizes[][3], size_t maxWidth, size_t maxHeight,
                    size_t maxDepth, size_t maxArraySize,
@@ -479,6 +482,13 @@
         outData[2] = tempData[3];
         outData[3] = tempData[0];
+    else if (format->image_channel_order == CL_ABGR)
+    {
+        outData[0] = tempData[3];
+        outData[1] = tempData[2];
+        outData[2] = tempData[1];
+        outData[3] = tempData[0];
+    }
     else if ((format->image_channel_order == CL_BGRA)
              || (format->image_channel_order == CL_sBGRA))
diff --git a/test_common/harness/integer_ops_test_info.h b/test_common/harness/integer_ops_test_info.h
new file mode 100644
index 0000000..ad7b303
--- /dev/null
+++ b/test_common/harness/integer_ops_test_info.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2021 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "conversions.h"
+#include "testHarness.h"
+// TODO: expand usage to other tests.
+template <typename T> struct TestInfo
+template <> struct TestInfo<cl_char>
+    static const ExplicitType explicitType = kChar;
+    static constexpr const char* deviceTypeName = "char";
+    static constexpr const char* deviceTypeNameSigned = "char";
+    static constexpr const char* deviceTypeNameUnsigned = "uchar";
+template <> struct TestInfo<cl_uchar>
+    static const ExplicitType explicitType = kUChar;
+    static constexpr const char* deviceTypeName = "uchar";
+    static constexpr const char* deviceTypeNameSigned = "char";
+    static constexpr const char* deviceTypeNameUnsigned = "uchar";
+template <> struct TestInfo<cl_short>
+    static const ExplicitType explicitType = kShort;
+    static constexpr const char* deviceTypeName = "short";
+    static constexpr const char* deviceTypeNameSigned = "short";
+    static constexpr const char* deviceTypeNameUnsigned = "ushort";
+template <> struct TestInfo<cl_ushort>
+    static const ExplicitType explicitType = kUShort;
+    static constexpr const char* deviceTypeName = "ushort";
+    static constexpr const char* deviceTypeNameSigned = "short";
+    static constexpr const char* deviceTypeNameUnsigned = "ushort";
+template <> struct TestInfo<cl_int>
+    static const ExplicitType explicitType = kInt;
+    static constexpr const char* deviceTypeName = "int";
+    static constexpr const char* deviceTypeNameSigned = "int";
+    static constexpr const char* deviceTypeNameUnsigned = "uint";
+template <> struct TestInfo<cl_uint>
+    static const ExplicitType explicitType = kUInt;
+    static constexpr const char* deviceTypeName = "uint";
+    static constexpr const char* deviceTypeNameSigned = "int";
+    static constexpr const char* deviceTypeNameUnsigned = "uint";
+template <> struct TestInfo<cl_long>
+    static const ExplicitType explicitType = kLong;
+    static constexpr const char* deviceTypeName = "long";
+    static constexpr const char* deviceTypeNameSigned = "long";
+    static constexpr const char* deviceTypeNameUnsigned = "ulong";
+template <> struct TestInfo<cl_ulong>
+    static const ExplicitType explicitType = kULong;
+    static constexpr const char* deviceTypeName = "ulong";
+    static constexpr const char* deviceTypeNameSigned = "long";
+    static constexpr const char* deviceTypeNameUnsigned = "ulong";
+template <typename T>
+static void fill_vector_with_random_data(std::vector<T>& v)
+    MTdataHolder d(gRandomSeed);
+    generate_random_data(TestInfo<T>::explicitType, v.size(), d, v.data());
diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp
index 95b9555..13ebcbc 100644
--- a/test_common/harness/kernelHelpers.cpp
+++ b/test_common/harness/kernelHelpers.cpp
@@ -530,7 +530,7 @@
                                                 sourceFilename, outputFilename);
             if (error != CL_SUCCESS) return error;
-            // read output file
+            // open output file for reading
             ifs.open(outputFilename.c_str(), std::ios::binary);
             if (!ifs.good())
@@ -540,6 +540,26 @@
+    if (compilationMode == kSpir_v && !gDisableSPIRVValidation)
+    {
+        std::string runString = gSPIRVValidator + " " + outputFilename;
+        int returnCode = system(runString.c_str());
+        if (returnCode == -1)
+        {
+            log_error("Error: failed to invoke SPIR-V validator\n");
+            return CL_COMPILE_PROGRAM_FAILURE;
+        }
+        else if (returnCode != 0)
+        {
+            log_error(
+                "Failed to validate SPIR-V file %s: system() returned 0x%x\n",
+                outputFilename.c_str(), returnCode);
+            return CL_COMPILE_PROGRAM_FAILURE;
+        }
+    }
     return CL_SUCCESS;
@@ -579,7 +599,7 @@
     if (error != CL_SUCCESS) return error;
     ifs.seekg(0, ifs.end);
-    int length = ifs.tellg();
+    size_t length = static_cast<size_t>(ifs.tellg());
     ifs.seekg(0, ifs.beg);
     // treat modifiedProgram as input for clCreateProgramWithBinary
@@ -1226,7 +1246,7 @@
     list = (cl_image_format *)malloc(count * sizeof(cl_image_format));
     if (NULL == list)
-        log_error("Error: unable to allocate %ld byte buffer for image format "
+        log_error("Error: unable to allocate %zu byte buffer for image format "
                   "list at %s:%d (err = %d)\n",
                   count * sizeof(cl_image_format), __FILE__, __LINE__, err);
         return 0;
@@ -1641,8 +1661,10 @@
         Version max_supported_cl_c_version{};
         for (const auto &name_version : name_versions)
-            Version current_version{ CL_VERSION_MAJOR(name_version.version),
-                                     CL_VERSION_MINOR(name_version.version) };
+            Version current_version{
+                static_cast<int>(CL_VERSION_MAJOR(name_version.version)),
+                static_cast<int>(CL_VERSION_MINOR(name_version.version))
+            };
             max_supported_cl_c_version =
                 (current_version > max_supported_cl_c_version)
                 ? current_version
@@ -1687,7 +1709,7 @@
                           current_version =
-                              (std::min)(device_version, current_version);
+                              std::min(device_version, current_version);
     return current_version;
@@ -1725,8 +1747,10 @@
         for (const auto &name_version : name_versions)
-            Version current_version{ CL_VERSION_MAJOR(name_version.version),
-                                     CL_VERSION_MINOR(name_version.version) };
+            Version current_version{
+                static_cast<int>(CL_VERSION_MAJOR(name_version.version)),
+                static_cast<int>(CL_VERSION_MINOR(name_version.version))
+            };
             if (current_version == version)
                 return true;
diff --git a/test_common/harness/mt19937.cpp b/test_common/harness/mt19937.cpp
index c32d9ba..f5665de 100644
--- a/test_common/harness/mt19937.cpp
+++ b/test_common/harness/mt19937.cpp
@@ -277,3 +277,5 @@
     unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
     return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
+bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); }
diff --git a/test_common/harness/mt19937.h b/test_common/harness/mt19937.h
index 35c8493..447ca25 100644
--- a/test_common/harness/mt19937.h
+++ b/test_common/harness/mt19937.h
@@ -90,24 +90,46 @@
 #ifdef __cplusplus
-#include <cassert>
+/* generates a random boolean */
+bool genrand_bool(MTdata /*data*/);
-struct MTdataHolder
-    MTdataHolder(cl_uint seed)
+#include <cassert>
+#include <utility>
+class MTdataHolder {
+    MTdataHolder() = default;
+    explicit MTdataHolder(cl_uint seed)
         m_mtdata = init_genrand(seed);
         assert(m_mtdata != nullptr);
-    MTdataHolder(MTdata mtdata): m_mtdata(mtdata) {}
+    // Forbid copy.
+    MTdataHolder(const MTdataHolder&) = delete;
+    MTdataHolder& operator=(const MTdataHolder&) = delete;
-    ~MTdataHolder() { free_mtdata(m_mtdata); }
+    // Support move semantics.
+    MTdataHolder(MTdataHolder&& h) { std::swap(m_mtdata, h.m_mtdata); }
+    MTdataHolder& operator=(MTdataHolder&& h)
+    {
+        std::swap(m_mtdata, h.m_mtdata);
+        return *this;
+    }
-    operator MTdata() const { return m_mtdata; }
+    ~MTdataHolder()
+    {
+        if (m_mtdata) free_mtdata(m_mtdata);
+    }
+    operator MTdata() const
+    {
+        assert(m_mtdata && "Object wasn't initialised");
+        return m_mtdata;
+    }
-    MTdata m_mtdata;
+    MTdata m_mtdata = nullptr;
 #endif // #ifdef __cplusplus
diff --git a/test_common/harness/os_helpers.cpp b/test_common/harness/os_helpers.cpp
index cd350cf..8fc9110 100644
--- a/test_common/harness/os_helpers.cpp
+++ b/test_common/harness/os_helpers.cpp
@@ -333,9 +333,6 @@
 #include <windows.h>
-#if defined(max)
-#undef max
 #include <cctype>
 #include <algorithm>
@@ -404,7 +401,8 @@
     for (;;)
-        DWORD len = GetModuleFileNameA(NULL, &path.front(), path.size());
+        DWORD len = GetModuleFileNameA(NULL, &path.front(),
+                                       static_cast<DWORD>(path.size()));
         if (len == 0)
diff --git a/test_common/harness/parseParameters.cpp b/test_common/harness/parseParameters.cpp
index b2ab5b0..e946d74 100644
--- a/test_common/harness/parseParameters.cpp
+++ b/test_common/harness/parseParameters.cpp
@@ -28,11 +28,14 @@
 using namespace std;
 #define DEFAULT_COMPILATION_PROGRAM "cl_offline_compiler"
+#define DEFAULT_SPIRV_VALIDATOR "spirv-val"
 CompilationMode gCompilationMode = kOnline;
 CompilationCacheMode gCompilationCacheMode = kCacheModeCompileIfAbsent;
 std::string gCompilationCachePath = ".";
 std::string gCompilationProgram = DEFAULT_COMPILATION_PROGRAM;
+bool gDisableSPIRVValidation = false;
+std::string gSPIRVValidator = DEFAULT_SPIRV_VALIDATOR;
 void helpInfo()
@@ -62,7 +65,14 @@
         Path for offline compiler output and CL source
     --compilation-program <prog>
         Program to use for offline compilation, defaults to:
-            )" DEFAULT_COMPILATION_PROGRAM "\n\n");
+For spir-v mode only:
+    --disable-spirv-validation
+        Disable validation of SPIR-V using the SPIR-V validator
+    --spirv-validator
+        Path for SPIR-V validator, defaults to )" DEFAULT_SPIRV_VALIDATOR "\n"
+        "\n");
 int parseCustomParam(int argc, const char *argv[], const char *ignore)
@@ -198,6 +208,26 @@
                 return -1;
+        else if (!strcmp(argv[i], "--disable-spirv-validation"))
+        {
+            delArg++;
+            gDisableSPIRVValidation = true;
+        }
+        else if (!strcmp(argv[i], "--spirv-validator"))
+        {
+            delArg++;
+            if ((i + 1) < argc)
+            {
+                delArg++;
+                gSPIRVValidator = argv[i + 1];
+            }
+            else
+            {
+                log_error("Program argument for --spirv-validator was not "
+                          "specified.\n");
+                return -1;
+            }
+        }
         // cleaning parameters from argv tab
         for (int j = i; j < argc - delArg; j++) argv[j] = argv[j + delArg];
diff --git a/test_common/harness/parseParameters.h b/test_common/harness/parseParameters.h
index b0f8328..437e12f 100644
--- a/test_common/harness/parseParameters.h
+++ b/test_common/harness/parseParameters.h
@@ -38,6 +38,8 @@
 extern CompilationCacheMode gCompilationCacheMode;
 extern std::string gCompilationCachePath;
 extern std::string gCompilationProgram;
+extern bool gDisableSPIRVValidation;
+extern std::string gSPIRVValidator;
 extern int parseCustomParam(int argc, const char *argv[],
                             const char *ignore = 0);
diff --git a/test_common/harness/propertyHelpers.cpp b/test_common/harness/propertyHelpers.cpp
index 3157ca8..6a10c07 100644
--- a/test_common/harness/propertyHelpers.cpp
+++ b/test_common/harness/propertyHelpers.cpp
@@ -19,6 +19,7 @@
 #include <assert.h>
 #include <algorithm>
+#include <cinttypes>
 #include <vector>
 static bool findProperty(const std::vector<cl_properties>& props,
@@ -97,14 +98,15 @@
             if (!found)
-                log_error("ERROR: expected property 0x%x not found!\n",
+                log_error("ERROR: expected property 0x%" PRIx64 " not found!\n",
                 return TEST_FAIL;
             else if (check_value != queried_value)
-                log_error("ERROR: mis-matched value for property 0x%x: wanted "
-                          "0x%x, got 0x%x\n",
+                log_error("ERROR: mis-matched value for property 0x%" PRIx64
+                          ": wanted "
+                          "0x%" PRIx64 ", got 0x%" PRIx64 "\n",
                           check_prop, check_value, queried_value);
                 return TEST_FAIL;
@@ -113,7 +115,7 @@
         if (queried.size() > check.size())
             log_error("ERROR: all properties found but there are extra "
-                      "properties: expected %d, got %d.\n",
+                      "properties: expected %zu, got %zu.\n",
                       check.size(), queried.size());
             return TEST_FAIL;
diff --git a/test_common/harness/rounding_mode.cpp b/test_common/harness/rounding_mode.cpp
index 681ccdd..1f53147 100644
--- a/test_common/harness/rounding_mode.cpp
+++ b/test_common/harness/rounding_mode.cpp
@@ -48,7 +48,7 @@
     const int *p = int_rounds;
     if (outType == kfloat || outType == kdouble) p = flt_rounds;
-    int fpscr = 0;
+    int64_t fpscr = 0;
     RoundingMode oldRound = get_round();
@@ -59,7 +59,7 @@
 RoundingMode get_round(void)
-    int fpscr;
+    int64_t fpscr;
     int oldRound;
@@ -203,13 +203,13 @@
 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
     union {
-        int i;
+        unsigned int i;
         void *p;
     } u = { _mm_getcsr() };
     _mm_setcsr(u.i | 0x8040);
     return u.p;
 #elif defined(__arm__) || defined(__aarch64__)
-    int fpscr;
+    int64_t fpscr;
     _FPU_SETCW(fpscr | FPSCR_FZ);
     return NULL;
@@ -239,7 +239,7 @@
     } u = { p };
 #elif defined(__arm__) || defined(__aarch64__)
-    int fpscr;
+    int64_t fpscr;
     _FPU_SETCW(fpscr & ~FPSCR_FZ);
 #elif defined(__PPC__)
diff --git a/test_common/harness/rounding_mode.h b/test_common/harness/rounding_mode.h
index 064a3a6..6f52f0a 100644
--- a/test_common/harness/rounding_mode.h
+++ b/test_common/harness/rounding_mode.h
@@ -16,8 +16,6 @@
 #ifndef __ROUNDING_MODE_H__
 #define __ROUNDING_MODE_H__
 #include "compat.h"
 #if (defined(_WIN32) && defined(_MSC_VER))
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 1aec3d0..a309f53 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -60,6 +60,54 @@
 #define DEFAULT_NUM_ELEMENTS 0x4000
+static int saveResultsToJson(const char *suiteName, test_definition testList[],
+                             unsigned char selectedTestList[],
+                             test_status resultTestList[], int testNum)
+    char *fileName = getenv("CL_CONFORMANCE_RESULTS_FILENAME");
+    if (fileName == nullptr)
+    {
+        return EXIT_SUCCESS;
+    }
+    FILE *file = fopen(fileName, "w");
+    if (NULL == file)
+    {
+        log_error("ERROR: Failed to open '%s' for writing results.\n",
+                  fileName);
+        return EXIT_FAILURE;
+    }
+    const char *save_map[] = { "success", "failure" };
+    const char *result_map[] = { "pass", "fail", "skip" };
+    const char *linebreak[] = { "", ",\n" };
+    int add_linebreak = 0;
+    fprintf(file, "{\n");
+    fprintf(file, "\t\"cmd\": \"%s\",\n", suiteName);
+    fprintf(file, "\t\"results\": {\n");
+    for (int i = 0; i < testNum; ++i)
+    {
+        if (selectedTestList[i])
+        {
+            fprintf(file, "%s\t\t\"%s\": \"%s\"", linebreak[add_linebreak],
+                    testList[i].name, result_map[(int)resultTestList[i]]);
+            add_linebreak = 1;
+        }
+    }
+    fprintf(file, "\n");
+    fprintf(file, "\t}\n");
+    fprintf(file, "}\n");
+    int ret = fclose(file) ? EXIT_FAILURE : EXIT_SUCCESS;
+    log_info("Saving results to %s: %s!\n", fileName, save_map[ret]);
+    return ret;
 int runTestHarness(int argc, const char *argv[], int testNum,
                    test_definition testList[], int forceNoContextCreation,
                    cl_command_queue_properties queueProps)
@@ -68,19 +116,28 @@
                                    forceNoContextCreation, queueProps, NULL);
-int skip_init_info(int count)
+int suite_did_not_pass_init(const char *suiteName, test_status status,
+                            int testNum, test_definition testList[])
-    log_info("Test skipped while initialization\n");
-    log_info("SKIPPED %d of %d tests.\n", count, count);
-    return EXIT_SUCCESS;
+    std::vector<unsigned char> selectedTestList(testNum, 1);
+    std::vector<test_status> resultTestList(testNum, status);
+    int ret = saveResultsToJson(suiteName, testList, selectedTestList.data(),
+                                resultTestList.data(), testNum);
+    log_info("Test %s while initialization\n",
+             status == TEST_SKIP ? "skipped" : "failed");
+    log_info("%s %d of %d tests.\n", status == TEST_SKIP ? "SKIPPED" : "FAILED",
+             testNum, testNum);
+    if (ret != EXIT_SUCCESS)
+    {
+        return ret;
+    }
+    return status == TEST_SKIP ? EXIT_SUCCESS : EXIT_FAILURE;
-int fail_init_info(int count)
-    log_info("Test failed while initialization\n");
-    log_info("FAILED %d of %d tests.\n", count, count);
-    return EXIT_FAILURE;
 void version_expected_info(const char *test_name, const char *api_name,
                            const char *expected_version,
                            const char *device_version)
@@ -470,6 +527,7 @@
         log_error("Invalid device address bit size returned by device.\n");
         return EXIT_FAILURE;
+    const char *suiteName = argv[0];
     if (gCompilationMode == kSpir_v)
         test_status spirv_readiness = check_spirv_compilation_readiness(device);
@@ -478,9 +536,15 @@
             switch (spirv_readiness)
                 case TEST_PASS: break;
-                case TEST_FAIL: return fail_init_info(testNum);
-                case TEST_SKIP: return skip_init_info(testNum);
-                case TEST_SKIPPED_ITSELF: return skip_init_info(testNum);
+                case TEST_FAIL:
+                    return suite_did_not_pass_init(suiteName, TEST_FAIL,
+                                                   testNum, testList);
+                case TEST_SKIP:
+                    return suite_did_not_pass_init(suiteName, TEST_SKIP,
+                                                   testNum, testList);
+                case TEST_SKIPPED_ITSELF:
+                    return suite_did_not_pass_init(suiteName, TEST_SKIP,
+                                                   testNum, testList);
@@ -492,9 +556,15 @@
         switch (status)
             case TEST_PASS: break;
-            case TEST_FAIL: return fail_init_info(testNum);
-            case TEST_SKIP: return skip_init_info(testNum);
-            case TEST_SKIPPED_ITSELF: return skip_init_info(testNum);
+            case TEST_FAIL:
+                return suite_did_not_pass_init(suiteName, TEST_FAIL, testNum,
+                                               testList);
+            case TEST_SKIP:
+                return suite_did_not_pass_init(suiteName, TEST_SKIP, testNum,
+                                               testList);
+            case TEST_SKIPPED_ITSELF:
+                return suite_did_not_pass_init(suiteName, TEST_SKIP, testNum,
+                                               testList);
@@ -574,49 +644,6 @@
     return EXIT_SUCCESS;
-static int saveResultsToJson(const char *fileName, const char *suiteName,
-                             test_definition testList[],
-                             unsigned char selectedTestList[],
-                             test_status resultTestList[], int testNum)
-    FILE *file = fopen(fileName, "w");
-    if (NULL == file)
-    {
-        log_error("ERROR: Failed to open '%s' for writing results.\n",
-                  fileName);
-        return EXIT_FAILURE;
-    }
-    const char *save_map[] = { "success", "failure" };
-    const char *result_map[] = { "pass", "fail", "skip" };
-    const char *linebreak[] = { "", ",\n" };
-    int add_linebreak = 0;
-    fprintf(file, "{\n");
-    fprintf(file, "\t\"cmd\": \"%s\",\n", suiteName);
-    fprintf(file, "\t\"results\": {\n");
-    for (int i = 0; i < testNum; ++i)
-    {
-        if (selectedTestList[i])
-        {
-            fprintf(file, "%s\t\t\"%s\": \"%s\"", linebreak[add_linebreak],
-                    testList[i].name, result_map[(int)resultTestList[i]]);
-            add_linebreak = 1;
-        }
-    }
-    fprintf(file, "\n");
-    fprintf(file, "\t}\n");
-    fprintf(file, "}\n");
-    int ret = fclose(file) ? 1 : 0;
-    log_info("Saving results to %s: %s!\n", fileName, save_map[ret]);
-    return ret;
 static void print_results(int failed, int count, const char *name)
     if (count < failed)
@@ -658,7 +685,6 @@
     int ret = EXIT_SUCCESS;
     unsigned char *selectedTestList = (unsigned char *)calloc(testNum, 1);
-    test_status *resultTestList = NULL;
     if (argc == 1)
@@ -697,24 +723,19 @@
     if (ret == EXIT_SUCCESS)
-        resultTestList =
-            (test_status *)calloc(testNum, sizeof(*resultTestList));
+        std::vector<test_status> resultTestList(testNum, TEST_PASS);
-        callTestFunctions(testList, selectedTestList, resultTestList, testNum,
-                          device, forceNoContextCreation, num_elements,
+        callTestFunctions(testList, selectedTestList, resultTestList.data(),
+                          testNum, device, forceNoContextCreation, num_elements,
         print_results(gFailCount, gTestCount, "sub-test");
         print_results(gTestsFailed, gTestsFailed + gTestsPassed, "test");
-        char *filename = getenv("CL_CONFORMANCE_RESULTS_FILENAME");
-        if (filename != NULL)
-        {
-            ret = saveResultsToJson(filename, argv[0], testList,
-                                    selectedTestList, resultTestList, testNum);
-        }
+        ret = saveResultsToJson(argv[0], testList, selectedTestList,
+                                resultTestList.data(), testNum);
-        if (std::any_of(resultTestList, resultTestList + testNum,
+        if (std::any_of(resultTestList.begin(), resultTestList.end(),
                         [](test_status result) {
                             switch (result)
@@ -730,7 +751,6 @@
-    free(resultTestList);
     return ret;
@@ -783,6 +803,14 @@
         return TEST_SKIP;
+    if (!check_functions_for_offline_compiler(test.name))
+    {
+        log_info("Subtest %s tests is not supported in offline compiler "
+                 "execution path!\n",
+                 test.name);
+        return TEST_SKIP;
+    }
     /* Create a context to work with, unless we're told not to */
     if (!forceNoContextCreation)
@@ -812,14 +840,12 @@
         if (queue == NULL)
             print_error(error, "Unable to create testing command queue");
+            clReleaseContext(context);
             return TEST_FAIL;
     /* Run the test and print the result */
-    error = check_functions_for_offline_compiler(test.name, deviceToUse);
-    test_missing_support_offline_cmpiler(error, test.name);
     if (test.func == NULL)
         // Skip unimplemented test, can happen when all of the tests are
@@ -1172,7 +1198,7 @@
 void PrintArch(void)
-    vlog("sizeof( void*) = %ld\n", sizeof(void *));
+    vlog("sizeof( void*) = %zu\n", sizeof(void *));
 #if defined(__ppc__)
 #elif defined(__ppc64__)
diff --git a/test_common/harness/threadTesting.cpp b/test_common/harness/threadTesting.cpp
deleted file mode 100644
index 875ee59..0000000
--- a/test_common/harness/threadTesting.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "compat.h"
-#include "threadTesting.h"
-#include "errorHelpers.h"
-#include <stdio.h>
-#include <string.h>
-#if !defined(_WIN32)
-#include <pthread.h>
-#if 0 // Disabed for now
-typedef struct
-    basefn            mFunction;
-    cl_device_id    mDevice;
-    cl_context        mContext;
-    int                mNumElements;
-} TestFnArgs;
-// Thread-based testing. Spawns a new thread to run the given test function,
-// then waits for it to complete. The entire idea is that, if the thread crashes,
-// we can catch it and report it as a failure instead of crashing the entire suite
-void *test_thread_wrapper( void *data )
-    TestFnArgs *args;
-    int retVal;
-    cl_context context;
-    args = (TestFnArgs *)data;
-    /* Create a new context to use (contexts can't cross threads) */
-    context = clCreateContext(NULL, args->mDeviceGroup);
-    if( context == NULL )
-    {
-        log_error("clCreateContext failed for new thread\n");
-        return (void *)(-1);
-    }
-    /* Call function */
-    retVal = args->mFunction( args->mDeviceGroup, args->mDevice, context, args->mNumElements );
-    clReleaseContext( context );
-    return (void *)retVal;
-int test_threaded_function( basefn fnToTest, cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
-    int error;
-    pthread_t threadHdl;
-    void *retVal;
-    TestFnArgs args;
-    args.mFunction = fnToTest;
-    args.mDeviceGroup = deviceGroup;
-    args.mDevice = device;
-    args.mContext = context;
-    args.mNumElements = numElements;
-    error = pthread_create( &threadHdl, NULL, test_thread_wrapper, (void *)&args );
-    if( error != 0 )
-    {
-        log_error( "ERROR: Unable to create thread for testing!\n" );
-        return -1;
-    }
-    /* Thread has been started, now just wait for it to complete (or crash) */
-    error = pthread_join( threadHdl, &retVal );
-    if( error != 0 )
-    {
-        log_error( "ERROR: Unable to join testing thread!\n" );
-        return -1;
-    }
-    return (int)((intptr_t)retVal);
diff --git a/test_common/harness/threadTesting.h b/test_common/harness/threadTesting.h
index 765eabc..2f3c187 100644
--- a/test_common/harness/threadTesting.h
+++ b/test_common/harness/threadTesting.h
@@ -24,8 +24,5 @@
 typedef int (*basefn)(cl_device_id deviceID, cl_context context,
                       cl_command_queue queue, int num_elements);
-extern int test_threaded_function(basefn fnToTest, cl_device_id device,
-                                  cl_context context, cl_command_queue queue,
-                                  int numElements);
-#endif // _threadTesting_h
+#endif // _threadTesting_h
\ No newline at end of file
diff --git a/test_common/harness/typeWrappers.h b/test_common/harness/typeWrappers.h
index 9a58a9d..50c7c93 100644
--- a/test_common/harness/typeWrappers.h
+++ b/test_common/harness/typeWrappers.h
@@ -16,123 +16,135 @@
 #ifndef _typeWrappers_h
 #define _typeWrappers_h
-#include <stdio.h>
-#include <stdlib.h>
 #if !defined(_WIN32)
 #include <sys/mman.h>
 #include "compat.h"
-#include <stdio.h>
 #include "mt19937.h"
 #include "errorHelpers.h"
 #include "kernelHelpers.h"
-/* cl_context wrapper */
+#include <cstdlib>
+#include <type_traits>
-class clContextWrapper {
-    clContextWrapper() { mContext = NULL; }
-    clContextWrapper(cl_context program) { mContext = program; }
-    ~clContextWrapper()
+namespace wrapper_details {
+// clRetain*() and clRelease*() functions share the same type.
+template <typename T> // T should be cl_context, cl_program, ...
+using RetainReleaseType = cl_int CL_API_CALL(T);
+// A generic wrapper class that follows OpenCL retain/release semantics.
+// This Wrapper class implements copy and move semantics, which makes it
+// compatible with standard containers for example.
+// Template parameters:
+//  - T is the cl_* type (e.g. cl_context, cl_program, ...)
+//  - Retain is the clRetain* function (e.g. clRetainContext, ...)
+//  - Release is the clRelease* function (e.g. clReleaseContext, ...)
+template <typename T, RetainReleaseType<T> Retain, RetainReleaseType<T> Release>
+class Wrapper {
+    static_assert(std::is_pointer<T>::value, "T should be a pointer type.");
+    T object = nullptr;
+    void retain()
-        if (mContext != NULL) clReleaseContext(mContext);
+        if (!object) return;
+        auto err = Retain(object);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clRetain*() failed");
+            std::abort();
+        }
-    clContextWrapper &operator=(const cl_context &rhs)
+    void release()
-        mContext = rhs;
+        if (!object) return;
+        auto err = Release(object);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clRelease*() failed");
+            std::abort();
+        }
+    }
+    Wrapper() = default;
+    // On initialisation, assume the object has a refcount of one.
+    Wrapper(T object): object(object) {}
+    // On assignment, assume the object has a refcount of one.
+    Wrapper &operator=(T rhs)
+    {
+        reset(rhs);
         return *this;
-    operator cl_context() const { return mContext; }
-    cl_context *operator&() { return &mContext; }
-    bool operator==(const cl_context &rhs) { return mContext == rhs; }
-    cl_context mContext;
-/* cl_program wrapper */
-class clProgramWrapper {
-    clProgramWrapper() { mProgram = NULL; }
-    clProgramWrapper(cl_program program) { mProgram = program; }
-    ~clProgramWrapper()
+    // Copy semantics, increase retain count.
+    Wrapper(Wrapper const &w) { *this = w; }
+    Wrapper &operator=(Wrapper const &w)
-        if (mProgram != NULL) clReleaseProgram(mProgram);
-    }
-    clProgramWrapper &operator=(const cl_program &rhs)
-    {
-        mProgram = rhs;
+        reset(w.object);
+        retain();
         return *this;
-    operator cl_program() const { return mProgram; }
-    cl_program *operator&() { return &mProgram; }
-    bool operator==(const cl_program &rhs) { return mProgram == rhs; }
-    cl_program mProgram;
-/* cl_kernel wrapper */
-class clKernelWrapper {
-    clKernelWrapper() { mKernel = NULL; }
-    clKernelWrapper(cl_kernel kernel) { mKernel = kernel; }
-    ~clKernelWrapper()
+    // Move semantics, directly take ownership.
+    Wrapper(Wrapper &&w) { *this = std::move(w); }
+    Wrapper &operator=(Wrapper &&w)
-        if (mKernel != NULL) clReleaseKernel(mKernel);
-    }
-    clKernelWrapper &operator=(const cl_kernel &rhs)
-    {
-        mKernel = rhs;
+        reset(w.object);
+        w.object = nullptr;
         return *this;
-    operator cl_kernel() const { return mKernel; }
-    cl_kernel *operator&() { return &mKernel; }
+    ~Wrapper() { reset(); }
-    bool operator==(const cl_kernel &rhs) { return mKernel == rhs; }
-    cl_kernel mKernel;
-/* cl_mem (stream) wrapper */
-class clMemWrapper {
-    clMemWrapper() { mMem = NULL; }
-    clMemWrapper(cl_mem mem) { mMem = mem; }
-    ~clMemWrapper()
+    // Release the existing object, if any, and own the new one, if any.
+    void reset(T new_object = nullptr)
-        if (mMem != NULL) clReleaseMemObject(mMem);
+        release();
+        object = new_object;
-    clMemWrapper &operator=(const cl_mem &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_mem() const { return mMem; }
+    operator T() const { return object; }
-    cl_mem *operator&() { return &mMem; }
-    bool operator==(const cl_mem &rhs) { return mMem == rhs; }
-    cl_mem mMem;
+    // Ideally this function should not exist as it breaks encapsulation by
+    // allowing external mutation of the Wrapper internal state. However, too
+    // much code currently relies on this. For example, instead of using T* as
+    // output parameters, existing code can be updated to use Wrapper& instead.
+    T *operator&() { return &object; }
+} // namespace wrapper_details
+using clContextWrapper =
+    wrapper_details::Wrapper<cl_context, clRetainContext, clReleaseContext>;
+using clProgramWrapper =
+    wrapper_details::Wrapper<cl_program, clRetainProgram, clReleaseProgram>;
+using clKernelWrapper =
+    wrapper_details::Wrapper<cl_kernel, clRetainKernel, clReleaseKernel>;
+using clMemWrapper =
+    wrapper_details::Wrapper<cl_mem, clRetainMemObject, clReleaseMemObject>;
+using clCommandQueueWrapper =
+    wrapper_details::Wrapper<cl_command_queue, clRetainCommandQueue,
+                             clReleaseCommandQueue>;
+using clSamplerWrapper =
+    wrapper_details::Wrapper<cl_sampler, clRetainSampler, clReleaseSampler>;
+using clEventWrapper =
+    wrapper_details::Wrapper<cl_event, clRetainEvent, clReleaseEvent>;
 class clProtectedImage {
@@ -183,92 +195,12 @@
     cl_mem *operator&() { return &image; }
-    bool operator==(const cl_mem &rhs) { return image == rhs; }
     void *backingStore;
     size_t backingStoreSize;
     cl_mem image;
-/* cl_command_queue wrapper */
-class clCommandQueueWrapper {
-    clCommandQueueWrapper() { mMem = NULL; }
-    clCommandQueueWrapper(cl_command_queue mem) { mMem = mem; }
-    ~clCommandQueueWrapper()
-    {
-        if (mMem != NULL)
-        {
-            clReleaseCommandQueue(mMem);
-        }
-    }
-    clCommandQueueWrapper &operator=(const cl_command_queue &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_command_queue() const { return mMem; }
-    cl_command_queue *operator&() { return &mMem; }
-    bool operator==(const cl_command_queue &rhs) { return mMem == rhs; }
-    cl_command_queue mMem;
-/* cl_sampler wrapper */
-class clSamplerWrapper {
-    clSamplerWrapper() { mMem = NULL; }
-    clSamplerWrapper(cl_sampler mem) { mMem = mem; }
-    ~clSamplerWrapper()
-    {
-        if (mMem != NULL) clReleaseSampler(mMem);
-    }
-    clSamplerWrapper &operator=(const cl_sampler &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_sampler() const { return mMem; }
-    cl_sampler *operator&() { return &mMem; }
-    bool operator==(const cl_sampler &rhs) { return mMem == rhs; }
-    cl_sampler mMem;
-/* cl_event wrapper */
-class clEventWrapper {
-    clEventWrapper() { mMem = NULL; }
-    clEventWrapper(cl_event mem) { mMem = mem; }
-    ~clEventWrapper()
-    {
-        if (mMem != NULL) clReleaseEvent(mMem);
-    }
-    clEventWrapper &operator=(const cl_event &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_event() const { return mMem; }
-    cl_event *operator&() { return &mMem; }
-    bool operator==(const cl_event &rhs) { return mMem == rhs; }
-    cl_event mMem;
 /* Generic protected memory buffer, for verifying access within bounds */
 class clProtectedArray {
diff --git a/test_conformance/CMakeLists.txt b/test_conformance/CMakeLists.txt
index 363ece8..f9514f1 100644
--- a/test_conformance/CMakeLists.txt
+++ b/test_conformance/CMakeLists.txt
@@ -52,6 +52,7 @@
 add_subdirectory( device_timer )
 add_subdirectory( spirv_new )
 add_subdirectory( spir )
+add_subdirectory( vulkan )
 file(GLOB CSV_FILES "opencl_conformance_tests_*.csv")
diff --git a/test_conformance/SVM/test_byte_granularity.cpp b/test_conformance/SVM/test_byte_granularity.cpp
index 403528b..6dbb364 100644
--- a/test_conformance/SVM/test_byte_granularity.cpp
+++ b/test_conformance/SVM/test_byte_granularity.cpp
@@ -58,7 +58,6 @@
   cl_uint     num_devices = 0;
   cl_int      err = CL_SUCCESS;
-  cl_int        rval = CL_SUCCESS;
   err = create_cl_objects(deviceID, &byte_manipulation_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
   if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
diff --git a/test_conformance/SVM/test_cross_buffer_pointers.cpp b/test_conformance/SVM/test_cross_buffer_pointers.cpp
index c1caebb..2baa7ad 100644
--- a/test_conformance/SVM/test_cross_buffer_pointers.cpp
+++ b/test_conformance/SVM/test_cross_buffer_pointers.cpp
@@ -162,7 +162,8 @@
     test_error(error, "clCreateBuffer failed.");
     // this buffer holds the index into the nodes buffer that is used for node allocation
-    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                            sizeof(size_t), NULL, &error);
     test_error(error, "clCreateBuffer failed.");
     // this buffer holds the count of correct nodes which is computed by the verify kernel.
diff --git a/test_conformance/SVM/test_migrate.cpp b/test_conformance/SVM/test_migrate.cpp
index 2a1ce05..f624bcd 100644
--- a/test_conformance/SVM/test_migrate.cpp
+++ b/test_conformance/SVM/test_migrate.cpp
@@ -78,9 +78,6 @@
     cl_uint amem[GLOBAL_SIZE];
     cl_uint bmem[GLOBAL_SIZE];
     cl_uint cmem[GLOBAL_SIZE];
-    cl_uint ramem[GLOBAL_SIZE];
-    cl_uint rbmem[GLOBAL_SIZE];
-    cl_uint rcmem[GLOBAL_SIZE];
     cl_event evs[20];
     const size_t global_size = GLOBAL_SIZE;
diff --git a/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp b/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp
index f26981b..1235816 100644
--- a/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp
+++ b/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp
@@ -98,7 +98,9 @@
   cl_int error = CL_SUCCESS;
   log_info("SVM: creating linked list on device: %d ", ci);
-  size_t *pAllocator = (size_t*) clEnqueueMapBuffer(cmdq, allocator, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
+  size_t *pAllocator = (size_t *)clEnqueueMapBuffer(
+      cmdq, allocator, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(size_t),
+      0, NULL, NULL, &error);
   test_error2(error, pAllocator, "clEnqueueMapBuffer failed");
   // reset allocator index
   *pAllocator = numLists;   // the first numLists elements of the nodes array are already allocated (they hold the head of each list).
@@ -206,7 +208,9 @@
     // this buffer holds an index into the nodes buffer, it is used for node allocation
-    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                            sizeof(size_t), NULL, &error);
     test_error(error, "clCreateBuffer failed.");
     error = clGetMemObjectInfo(allocator, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), &usesSVMpointer, 0);
diff --git a/test_conformance/SVM/test_shared_address_space_fine_grain.cpp b/test_conformance/SVM/test_shared_address_space_fine_grain.cpp
index a98a880..3350972 100644
--- a/test_conformance/SVM/test_shared_address_space_fine_grain.cpp
+++ b/test_conformance/SVM/test_shared_address_space_fine_grain.cpp
@@ -47,7 +47,7 @@
   test_error2(error, pNodes, "malloc failed");
   // this allocation holds an index into the nodes buffer, it is used for node allocation
-  size_t* pAllocator = (size_t*) align_malloc(sizeof(cl_int), 128);
+  size_t *pAllocator = (size_t *)align_malloc(sizeof(size_t), 128);
   test_error2(error, pAllocator, "malloc failed");
   // this allocation holds the count of correct nodes, which is computed by the verify kernel.
diff --git a/test_conformance/SVM/test_shared_sub_buffers.cpp b/test_conformance/SVM/test_shared_sub_buffers.cpp
index a79484c..2532886 100644
--- a/test_conformance/SVM/test_shared_sub_buffers.cpp
+++ b/test_conformance/SVM/test_shared_sub_buffers.cpp
@@ -182,7 +182,8 @@
     // this buffer holds the index into the nodes buffer that is used for node allocation
-    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                            sizeof(size_t), NULL, &error);
     test_error(error, "clCreateBuffer failed.");
     // this buffer holds the count of correct nodes which is computed by the verify kernel.
diff --git a/test_conformance/allocations/allocation_fill.cpp b/test_conformance/allocations/allocation_fill.cpp
index a755894..b4ea379 100644
--- a/test_conformance/allocations/allocation_fill.cpp
+++ b/test_conformance/allocations/allocation_fill.cpp
@@ -200,8 +200,10 @@
       result = clFinish(*queue);
       if (result != SUCCEEDED)
-        print_error(error, "clFinish failed after successful enquing filling buffer with data.");
-        return result;
+          print_error(error,
+                      "clFinish failed after successful enqueuing filling "
+                      "buffer with data.");
+          return result;
     } else {
       error = clEnqueueWriteImage(*queue, mem, CL_FALSE, origin, region, 0, 0, data, 0, NULL, &event);
diff --git a/test_conformance/allocations/allocation_functions.cpp b/test_conformance/allocations/allocation_functions.cpp
index 7182c72..827ee10 100644
--- a/test_conformance/allocations/allocation_functions.cpp
+++ b/test_conformance/allocations/allocation_functions.cpp
@@ -37,8 +37,8 @@
   if (size_to_allocate == 0) {
-    log_error("Trying to allcoate a zero sized image.\n");
-    return FAILED_ABORT;
+      log_error("Trying to allocate a zero sized image.\n");
+      return FAILED_ABORT;
   error = clGetDeviceInfo( device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( max_width ), &max_width, NULL );
diff --git a/test_conformance/allocations/main.cpp b/test_conformance/allocations/main.cpp
index 0dec4c6..43e8127 100644
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -112,6 +112,8 @@
     int number_of_mems_used;
     cl_ulong max_individual_allocation_size = g_max_individual_allocation_size;
     cl_ulong global_mem_size = g_global_mem_size ;
+    const bool allocate_image =
+        (alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING);
     static const char* alloc_description[] = {
@@ -123,7 +125,7 @@
     // Skip image tests if we don't support images on the device
-    if( alloc_type > BUFFER && checkForImageSupport( device ) )
+    if (allocate_image && checkForImageSupport(device))
         log_info( "Can not test image allocation because device does not support images.\n" );
         return 0;
@@ -132,7 +134,7 @@
     // This section was added in order to fix a bug in the test
     // The test will fail in image allocations as the size requested for the allocation will be much grater than the maximum size allowed for image
-    if( ( alloc_type != BUFFER ) && ( alloc_type != BUFFER_NON_BLOCKING ) )
+    if (allocate_image)
         size_t max_width, max_height;
diff --git a/test_conformance/api/negative_platform.cpp b/test_conformance/api/negative_platform.cpp
index 7d9de5d..861d474 100644
--- a/test_conformance/api/negative_platform.cpp
+++ b/test_conformance/api/negative_platform.cpp
@@ -42,18 +42,9 @@
     cl_platform_id platform = getPlatformFromDevice(deviceID);
-    cl_int err =
-        clGetPlatformInfo(reinterpret_cast<cl_platform_id>(deviceID),
-                          CL_PLATFORM_VERSION, sizeof(char*), nullptr, nullptr);
-    test_failure_error_ret(
-        err, CL_INVALID_PLATFORM,
-        "clGetPlatformInfo should return CL_INVALID_PLATFORM  when: \"platform "
-        "is not a valid platform\" using a valid object which is NOT a "
-        "platform",
-        TEST_FAIL);
     constexpr cl_platform_info INVALID_PARAM_VALUE = 0;
-    err = clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr);
+    cl_int err =
+        clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr);
         err, CL_INVALID_VALUE,
         "clGetPlatformInfo should return CL_INVALID_VALUE when: \"param_name "
diff --git a/test_conformance/api/test_api_min_max.cpp b/test_conformance/api/test_api_min_max.cpp
index 9e981cd..086008d 100644
--- a/test_conformance/api/test_api_min_max.cpp
+++ b/test_conformance/api/test_api_min_max.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -22,33 +22,32 @@
 const char *sample_single_param_kernel[] = {
     "__kernel void sample_test(__global int *src)\n"
-    "    int  tid = get_global_id(0);\n"
+    "    size_t  tid = get_global_id(0);\n"
-    "}\n" };
+    "}\n"
-const char *sample_single_param_write_kernel[] = {
-    "__kernel void sample_test(__global int *src)\n"
-    "{\n"
-    "    int  tid = get_global_id(0);\n"
-    "     src[tid] = tid;\n"
-    "\n"
-    "}\n" };
 const char *sample_read_image_kernel_pattern[] = {
-    "__kernel void sample_test( __global float *result, ",  " )\n"
+    "__kernel void sample_test( __global float *result, ",
+    " )\n"
-    "    int  tid = get_global_id(0);\n"
+    "  sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | "
+    "    size_t  tid = get_global_id(0);\n"
     "    result[0] = 0.0f;\n",
-    "}\n" };
+    "}\n"
 const char *sample_write_image_kernel_pattern[] = {
-    "__kernel void sample_test( ",  " )\n"
+    "__kernel void sample_test( ",
+    " )\n"
-    "    int  tid = get_global_id(0);\n",
+    "    size_t  tid = get_global_id(0);\n",
-    "}\n" };
+    "}\n"
 const char *sample_large_parmam_kernel_pattern[] = {
@@ -57,7 +56,8 @@
     "result[0] = 0;\n"
-    "}\n" };
+    "}\n"
 const char *sample_large_int_parmam_kernel_pattern[] = {
     "__kernel void sample_test(%s, __global int *result)\n"
@@ -65,47 +65,55 @@
     "result[0] = 0;\n"
-    "}\n" };
+    "}\n"
 const char *sample_sampler_kernel_pattern[] = {
-    "__kernel void sample_test( read_only image2d_t src, __global int4 *dst", ", sampler_t sampler%d", ")\n"
+    "__kernel void sample_test( read_only image2d_t src, __global int4 *dst",
+    ", sampler_t sampler%d",
+    ")\n"
-    "    int  tid = get_global_id(0);\n",
-    "     dst[ 0 ] = read_imagei( src, sampler%d, (int2)( 0, 0 ) );\n",
+    "    size_t  tid = get_global_id(0);\n",
+    "    dst[ 0 ] = read_imagei( src, sampler%d, (int2)( 0, 0 ) );\n",
-    "}\n" };
+    "}\n"
 const char *sample_const_arg_kernel[] = {
     "__kernel void sample_test(__constant int *src1, __global int *dst)\n"
-    "    int  tid = get_global_id(0);\n"
+    "    size_t  tid = get_global_id(0);\n"
     "    dst[tid] = src1[tid];\n"
-    "}\n" };
+    "}\n"
 const char *sample_local_arg_kernel[] = {
-    "__kernel void sample_test(__local int *src1, __global int *global_src, __global int *dst)\n"
+    "__kernel void sample_test(__local int *src1, __global int *global_src, "
+    "__global int *dst)\n"
-    "    int  tid = get_global_id(0);\n"
+    "    size_t  tid = get_global_id(0);\n"
     "    src1[tid] = global_src[tid];\n"
     "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
     "    dst[tid] = src1[tid];\n"
-    "}\n" };
+    "}\n"
 const char *sample_const_max_arg_kernel_pattern =
-"__kernel void sample_test(__constant int *src1 %s, __global int *dst)\n"
-"    int  tid = get_global_id(0);\n"
-"    dst[tid] = src1[tid];\n"
+    "__kernel void sample_test(__constant int *src1 %s, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = src1[tid];\n"
+    "%s"
+    "\n"
+    "}\n";
-int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
     int error, retVal;
     unsigned int maxThreadDim, threadDim, i;
@@ -118,19 +126,24 @@
     /* Get the max thread dimensions */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( maxThreadDim ), &maxThreadDim, NULL );
-    test_error( error, "Unable to get max work item dimensions from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                            sizeof(maxThreadDim), &maxThreadDim, NULL);
+    test_error(error, "Unable to get max work item dimensions from device");
-    if( maxThreadDim < 3 )
+    if (maxThreadDim < 3)
-        log_error( "ERROR: Reported max work item dimensions is less than required! (%d)\n", maxThreadDim );
+        log_error("ERROR: Reported max work item dimensions is less than "
+                  "required! (%d)\n",
+                  maxThreadDim);
         return -1;
     log_info("Reported max thread dimensions of %d.\n", maxThreadDim);
     /* Create a kernel to test with */
-    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_single_param_kernel, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    sample_single_param_kernel, "sample_test")
+        != 0)
         return -1;
@@ -138,105 +151,122 @@
     /* Create some I/O streams */
     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_int) * 100, NULL, &error);
-    if( streams[0] == NULL )
+    if (streams[0] == NULL)
         log_error("ERROR: Creating test array failed!\n");
         return -1;
     /* Set the arguments */
-    error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel arguments");
     retVal = 0;
     /* Now try running the kernel with up to that many threads */
-    for (threadDim=1; threadDim <= maxThreadDim; threadDim++)
+    for (threadDim = 1; threadDim <= maxThreadDim; threadDim++)
-        threads = (size_t *)malloc( sizeof( size_t ) * maxThreadDim );
-        localThreads = (size_t *)malloc( sizeof( size_t ) * maxThreadDim );
-        for( i = 0; i < maxThreadDim; i++ )
+        threads = (size_t *)malloc(sizeof(size_t) * maxThreadDim);
+        localThreads = (size_t *)malloc(sizeof(size_t) * maxThreadDim);
+        for (i = 0; i < maxThreadDim; i++)
-            threads[ i ] = 1;
+            threads[i] = 1;
             localThreads[i] = 1;
-        error = clEnqueueNDRangeKernel( queue, kernel, maxThreadDim, NULL, threads, localThreads, 0, NULL, &event );
-        test_error( error, "Failed clEnqueueNDRangeKernel");
+        error = clEnqueueNDRangeKernel(queue, kernel, maxThreadDim, NULL,
+                                       threads, localThreads, 0, NULL, &event);
+        test_error(error, "Failed clEnqueueNDRangeKernel");
         // Verify that the event does not return an error from the execution
         error = clWaitForEvents(1, &event);
-        test_error( error, "clWaitForEvent failed");
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-        test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+        test_error(error, "clWaitForEvent failed");
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(event_status), &event_status, NULL);
+        test_error(
+            error,
+            "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
         if (event_status < 0)
             test_error(error, "Kernel execution event returned error");
         /* All done */
-        free( threads );
-        free( localThreads );
+        free(threads);
+        free(localThreads);
     return retVal;
-int test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     int error;
     size_t *deviceMaxWorkItemSize;
     unsigned int maxWorkItemDim;
     /* Get the max work item dimensions */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( maxWorkItemDim ), &maxWorkItemDim, NULL );
-    test_error( error, "Unable to get max work item dimensions from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                            sizeof(maxWorkItemDim), &maxWorkItemDim, NULL);
+    test_error(error, "Unable to get max work item dimensions from device");
-    log_info("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS returned %d\n", maxWorkItemDim);
-    deviceMaxWorkItemSize = (size_t*)malloc(sizeof(size_t)*maxWorkItemDim);
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxWorkItemDim, deviceMaxWorkItemSize, NULL );
-    test_error( error, "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed" );
+    log_info("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS returned %d\n",
+             maxWorkItemDim);
+    deviceMaxWorkItemSize = (size_t *)malloc(sizeof(size_t) * maxWorkItemDim);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(size_t) * maxWorkItemDim,
+                            deviceMaxWorkItemSize, NULL);
+    test_error(error, "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed");
     unsigned int i;
     int errors = 0;
-    for(i=0; i<maxWorkItemDim; i++) {
-        if (deviceMaxWorkItemSize[i]<1) {
-            log_error("MAX_WORK_ITEM_SIZE in dimension %d is invalid: %lu\n", i, deviceMaxWorkItemSize[i]);
+    for (i = 0; i < maxWorkItemDim; i++)
+    {
+        if (deviceMaxWorkItemSize[i] < 1)
+        {
+            log_error("MAX_WORK_ITEM_SIZE in dimension %d is invalid: %lu\n", i,
+                      deviceMaxWorkItemSize[i]);
-        } else {
-            log_info("Dimension %d has max work item size %lu\n", i, deviceMaxWorkItemSize[i]);
+        }
+        else
+        {
+            log_info("Dimension %d has max work item size %lu\n", i,
+                     deviceMaxWorkItemSize[i]);
-    if (errors)
-        return -1;
+    if (errors) return -1;
     return 0;
-int test_min_max_work_group_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_work_group_size(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
     int error;
     size_t deviceMaxThreadSize;
     /* Get the max thread dimensions */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( deviceMaxThreadSize ), &deviceMaxThreadSize, NULL );
-    test_error( error, "Unable to get max work group size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(deviceMaxThreadSize), &deviceMaxThreadSize,
+                            NULL);
+    test_error(error, "Unable to get max work group size from device");
     log_info("Reported %ld max device work group size.\n", deviceMaxThreadSize);
-    if( deviceMaxThreadSize == 0 )
+    if (deviceMaxThreadSize == 0)
-        log_error( "ERROR: Max work group size is reported as zero!\n" );
+        log_error("ERROR: Max work group size is reported as zero!\n");
         return -1;
     return 0;
-int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_read_image_args(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
     int error;
     unsigned int maxReadImages, i;
@@ -245,48 +275,55 @@
     char readArgLine[128], *programSrc;
     const char *readArgPattern = ", read_only image2d_t srcimg%d";
     clKernelWrapper kernel;
-    clMemWrapper    *streams, result;
+    clMemWrapper *streams, result;
     size_t threads[2];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     size_t maxParameterSize;
     cl_event event;
     cl_int event_status;
-    cl_float image_data[4*4];
+    cl_float image_data[4 * 4];
     float image_result = 0.0f;
     float actual_image_result;
     cl_uint minRequiredReadImages = gIsEmbedded ? 8 : 128;
     cl_device_type deviceType;
     image_format_desc.image_channel_order = CL_RGBA;
     image_format_desc.image_channel_data_type = CL_FLOAT;
     /* Get the max read image arg count */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof( maxReadImages ), &maxReadImages, NULL );
-    test_error( error, "Unable to get max read image arg count from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_READ_IMAGE_ARGS,
+                            sizeof(maxReadImages), &maxReadImages, NULL);
+    test_error(error, "Unable to get max read image arg count from device");
-    if( maxReadImages < minRequiredReadImages )
+    if (maxReadImages < minRequiredReadImages)
-        log_error( "ERROR: Reported max read image arg count is less than required! (%d)\n", maxReadImages );
+        log_error("ERROR: Reported max read image arg count is less than "
+                  "required! (%d)\n",
+                  maxReadImages);
         return -1;
     log_info("Reported %d max read image args.\n", maxReadImages);
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_ADDRESS_BITS, sizeof( deviceAddressSize ), &deviceAddressSize, NULL );
-    test_error( error, "Unable to query CL_DEVICE_ADDRESS_BITS for device" );
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS,
+                        sizeof(deviceAddressSize), &deviceAddressSize, NULL);
+    test_error(error, "Unable to query CL_DEVICE_ADDRESS_BITS for device");
     deviceAddressSize /= 8; // convert from bits to bytes
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
     if (!gIsEmbedded && maxReadImages >= 128 && maxParameterSize == 1024)
-        error = clGetDeviceInfo( deviceID, CL_DEVICE_TYPE, sizeof( deviceType ), &deviceType, NULL );
-        test_error( error, "Unable to get device type from device" );
+        error = clGetDeviceInfo(deviceID, CL_DEVICE_TYPE, sizeof(deviceType),
+                                &deviceType, NULL);
+        test_error(error, "Unable to get device type from device");
-        if(deviceType != CL_DEVICE_TYPE_CUSTOM)
+        if (deviceType != CL_DEVICE_TYPE_CUSTOM)
             maxReadImages = 127;
@@ -295,85 +332,107 @@
     maxParameterSize -= deviceAddressSize;
     // Calculate the number we can use
-    if (maxParameterSize/deviceAddressSize < maxReadImages) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/deviceAddressSize));
-        maxReadImages = (unsigned int)(maxParameterSize/deviceAddressSize);
+    if (maxParameterSize / deviceAddressSize < maxReadImages)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max image arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / deviceAddressSize));
+        maxReadImages = (unsigned int)(maxParameterSize / deviceAddressSize);
     /* Create a program with that many read args */
-    programSrc = (char *)malloc( strlen( sample_read_image_kernel_pattern[ 0 ] ) + ( strlen( readArgPattern ) + 6 ) * ( maxReadImages ) +
-                                strlen( sample_read_image_kernel_pattern[ 1 ] ) + 1 + 40240);
+    programSrc = (char *)malloc(strlen(sample_read_image_kernel_pattern[0])
+                                + (strlen(readArgPattern) + 6) * (maxReadImages)
+                                + strlen(sample_read_image_kernel_pattern[1])
+                                + 1 + 40240);
-    strcpy( programSrc, sample_read_image_kernel_pattern[ 0 ] );
-    strcat( programSrc, "read_only image2d_t srcimg0" );
-    for( i = 0; i < maxReadImages-1; i++ )
+    strcpy(programSrc, sample_read_image_kernel_pattern[0]);
+    strcat(programSrc, "read_only image2d_t srcimg0");
+    for (i = 0; i < maxReadImages - 1; i++)
-        sprintf( readArgLine, readArgPattern, i+1 );
-        strcat( programSrc, readArgLine );
+        sprintf(readArgLine, readArgPattern, i + 1);
+        strcat(programSrc, readArgLine);
-    strcat( programSrc, sample_read_image_kernel_pattern[ 1 ] );
-    for ( i = 0; i < maxReadImages; i++) {
-        sprintf( readArgLine, "\tresult[0] += read_imagef( srcimg%d, sampler, (int2)(0,0)).x;\n", i);
-        strcat( programSrc, readArgLine );
+    strcat(programSrc, sample_read_image_kernel_pattern[1]);
+    for (i = 0; i < maxReadImages; i++)
+    {
+        sprintf(
+            readArgLine,
+            "\tresult[0] += read_imagef( srcimg%d, sampler, (int2)(0,0)).x;\n",
+            i);
+        strcat(programSrc, readArgLine);
-    strcat( programSrc, sample_read_image_kernel_pattern[ 2 ] );
+    strcat(programSrc, sample_read_image_kernel_pattern[2]);
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test");
-    test_error( error, "Failed to create the program and kernel.");
-    free( programSrc );
+    error =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programSrc, "sample_test");
+    test_error(error, "Failed to create the program and kernel.");
+    free(programSrc);
     result = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float), NULL,
-    test_error( error, "clCreateBufer failed");
+    test_error(error, "clCreateBufer failed");
     /* Create some I/O streams */
     streams = new clMemWrapper[maxReadImages + 1];
-    for( i = 0; i < maxReadImages; i++ )
+    for (i = 0; i < maxReadImages; i++)
-        image_data[0]=i;
-        image_result+= image_data[0];
-        streams[i] = create_image_2d( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &image_format_desc, 4, 4, 0, image_data, &error );
-        test_error( error, "Unable to allocate test image" );
+        image_data[0] = i;
+        image_result += image_data[0];
+        streams[i] =
+            create_image_2d(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                            &image_format_desc, 4, 4, 0, image_data, &error);
+        test_error(error, "Unable to allocate test image");
-    error = clSetKernelArg( kernel, 0, sizeof( result ), &result );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(result), &result);
+    test_error(error, "Unable to set kernel arguments");
     /* Set the arguments */
-    for( i = 1; i < maxReadImages+1; i++ )
+    for (i = 1; i < maxReadImages + 1; i++)
-        error = clSetKernelArg( kernel, i, sizeof( streams[i-1] ), &streams[i-1] );
-        test_error( error, "Unable to set kernel arguments" );
+        error =
+            clSetKernelArg(kernel, i, sizeof(streams[i - 1]), &streams[i - 1]);
+        test_error(error, "Unable to set kernel arguments");
     /* Now try running the kernel */
     threads[0] = threads[1] = 1;
-    error = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, &event );
-    test_error( error, "clEnqueueNDRangeKernel failed");
+    error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, threads, NULL, 0,
+                                   NULL, &event);
+    test_error(error, "clEnqueueNDRangeKernel failed");
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
-    error = clEnqueueReadBuffer(queue, result, CL_TRUE, 0, sizeof(cl_float), &actual_image_result, 0, NULL, NULL);
+    error = clEnqueueReadBuffer(queue, result, CL_TRUE, 0, sizeof(cl_float),
+                                &actual_image_result, 0, NULL, NULL);
     test_error(error, "clEnqueueReadBuffer failed");
     delete[] streams;
-    if (actual_image_result != image_result) {
-        log_error("Result failed to verify. Got %g, expected %g.\n", actual_image_result, image_result);
+    if (actual_image_result != image_result)
+    {
+        log_error("Result failed to verify. Got %g, expected %g.\n",
+                  actual_image_result, image_result);
         return 1;
     return 0;
-int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_write_image_args(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     int error;
     unsigned int maxWriteImages, i;
@@ -381,94 +440,117 @@
     char writeArgLine[128], *programSrc;
     const char *writeArgPattern = ", write_only image2d_t dstimg%d";
     clKernelWrapper kernel;
-    clMemWrapper    *streams;
+    clMemWrapper *streams;
     size_t threads[2];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     size_t maxParameterSize;
     cl_event event;
     cl_int event_status;
     cl_uint minRequiredWriteImages = gIsEmbedded ? 1 : 8;
     image_format_desc.image_channel_order = CL_RGBA;
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     /* Get the max read image arg count */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof( maxWriteImages ), &maxWriteImages, NULL );
-    test_error( error, "Unable to get max write image arg count from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WRITE_IMAGE_ARGS,
+                            sizeof(maxWriteImages), &maxWriteImages, NULL);
+    test_error(error, "Unable to get max write image arg count from device");
-    if( maxWriteImages == 0 )
+    if (maxWriteImages == 0)
-        log_info( "WARNING: Device reports 0 for a max write image arg count (write image arguments unsupported). Skipping test (implicitly passes). This is only valid if the number of image formats is also 0.\n" );
+        log_info(
+            "WARNING: Device reports 0 for a max write image arg count (write "
+            "image arguments unsupported). Skipping test (implicitly passes). "
+            "This is only valid if the number of image formats is also 0.\n");
         return 0;
-    if( maxWriteImages < minRequiredWriteImages )
+    if (maxWriteImages < minRequiredWriteImages)
-        log_error( "ERROR: Reported max write image arg count is less than required! (%d)\n", maxWriteImages );
+        log_error("ERROR: Reported max write image arg count is less than "
+                  "required! (%d)\n",
+                  maxWriteImages);
         return -1;
     log_info("Reported %d max write image args.\n", maxWriteImages);
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
     // Calculate the number we can use
-    if (maxParameterSize/sizeof(cl_mem) < maxWriteImages) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_mem)));
-        maxWriteImages = (unsigned int)(maxParameterSize/sizeof(cl_mem));
+    if (maxParameterSize / sizeof(cl_mem) < maxWriteImages)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max image arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / sizeof(cl_mem)));
+        maxWriteImages = (unsigned int)(maxParameterSize / sizeof(cl_mem));
     /* Create a program with that many write args + 1 */
-    programSrc = (char *)malloc( strlen( sample_write_image_kernel_pattern[ 0 ] ) + ( strlen( writeArgPattern ) + 6 ) * ( maxWriteImages + 1 ) +
-                                strlen( sample_write_image_kernel_pattern[ 1 ] ) + 1 + 40240 );
+    programSrc = (char *)malloc(
+        strlen(sample_write_image_kernel_pattern[0])
+        + (strlen(writeArgPattern) + 6) * (maxWriteImages + 1)
+        + strlen(sample_write_image_kernel_pattern[1]) + 1 + 40240);
-    strcpy( programSrc, sample_write_image_kernel_pattern[ 0 ] );
-    strcat( programSrc, "write_only image2d_t dstimg0" );
-    for( i = 1; i < maxWriteImages; i++ )
+    strcpy(programSrc, sample_write_image_kernel_pattern[0]);
+    strcat(programSrc, "write_only image2d_t dstimg0");
+    for (i = 1; i < maxWriteImages; i++)
-        sprintf( writeArgLine, writeArgPattern, i );
-        strcat( programSrc, writeArgLine );
+        sprintf(writeArgLine, writeArgPattern, i);
+        strcat(programSrc, writeArgLine);
-    strcat( programSrc, sample_write_image_kernel_pattern[ 1 ] );
-    for ( i = 0; i < maxWriteImages; i++) {
-        sprintf( writeArgLine, "\twrite_imagef( dstimg%d, (int2)(0,0), (float4)(0,0,0,0));\n", i);
-        strcat( programSrc, writeArgLine );
+    strcat(programSrc, sample_write_image_kernel_pattern[1]);
+    for (i = 0; i < maxWriteImages; i++)
+    {
+        sprintf(writeArgLine,
+                "\twrite_imagef( dstimg%d, (int2)(0,0), (float4)(0,0,0,0));\n",
+                i);
+        strcat(programSrc, writeArgLine);
-    strcat( programSrc, sample_write_image_kernel_pattern[ 2 ] );
+    strcat(programSrc, sample_write_image_kernel_pattern[2]);
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test");
-    test_error( error, "Failed to create the program and kernel.");
-    free( programSrc );
+    error =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programSrc, "sample_test");
+    test_error(error, "Failed to create the program and kernel.");
+    free(programSrc);
     /* Create some I/O streams */
     streams = new clMemWrapper[maxWriteImages + 1];
-    for( i = 0; i < maxWriteImages; i++ )
+    for (i = 0; i < maxWriteImages; i++)
-        streams[i] = create_image_2d( context, CL_MEM_READ_WRITE, &image_format_desc, 16, 16, 0, NULL, &error );
-        test_error( error, "Unable to allocate test image" );
+        streams[i] =
+            create_image_2d(context, CL_MEM_READ_WRITE, &image_format_desc, 16,
+                            16, 0, NULL, &error);
+        test_error(error, "Unable to allocate test image");
     /* Set the arguments */
-    for( i = 0; i < maxWriteImages; i++ )
+    for (i = 0; i < maxWriteImages; i++)
-        error = clSetKernelArg( kernel, i, sizeof( streams[i] ), &streams[i] );
-        test_error( error, "Unable to set kernel arguments" );
+        error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+        test_error(error, "Unable to set kernel arguments");
     /* Now try running the kernel */
     threads[0] = threads[1] = 16;
-    error = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, &event );
-    test_error( error, "clEnqueueNDRangeKernel failed.");
+    error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, threads, NULL, 0,
+                                   NULL, &event);
+    test_error(error, "clEnqueueNDRangeKernel failed.");
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
@@ -478,7 +560,8 @@
     return 0;
-int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     int error;
     cl_ulong maxAllocSize, memSize, minSizeToTry;
@@ -492,61 +575,89 @@
         requiredAllocSize = 128 * 1024 * 1024;
     /* Get the max mem alloc size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get max mem alloc size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get max mem alloc size from device");
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( memSize ), &memSize, NULL );
-    test_error( error, "Unable to get global memory size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(memSize), &memSize, NULL);
+    test_error(error, "Unable to get global memory size from device");
-    if (memSize > (cl_ulong)SIZE_MAX) {
-      memSize = (cl_ulong)SIZE_MAX;
+    if (memSize > (cl_ulong)SIZE_MAX)
+    {
+        memSize = (cl_ulong)SIZE_MAX;
-    if( maxAllocSize < requiredAllocSize)
+    if (maxAllocSize < requiredAllocSize)
-        log_error( "ERROR: Reported max allocation size is less than required %lldMB! (%llu or %lluMB, from a total mem size of %lldMB)\n", (requiredAllocSize / 1024) / 1024, maxAllocSize, (maxAllocSize / 1024)/1024, (memSize / 1024)/1024 );
+        log_error("ERROR: Reported max allocation size is less than required "
+                  "%lldMB! (%llu or %lluMB, from a total mem size of %lldMB)\n",
+                  (requiredAllocSize / 1024) / 1024, maxAllocSize,
+                  (maxAllocSize / 1024) / 1024, (memSize / 1024) / 1024);
         return -1;
-    requiredAllocSize = ((memSize / 4) > (1024 * 1024 * 1024)) ? 1024 * 1024 * 1024 : memSize / 4;
+    requiredAllocSize = ((memSize / 4) > (1024 * 1024 * 1024))
+        ? 1024 * 1024 * 1024
+        : memSize / 4;
     if (gIsEmbedded)
-        requiredAllocSize = (requiredAllocSize < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : requiredAllocSize;
+        requiredAllocSize = (requiredAllocSize < 1 * 1024 * 1024)
+            ? 1 * 1024 * 1024
+            : requiredAllocSize;
-    requiredAllocSize = (requiredAllocSize < 128 * 1024 * 1024) ? 128 * 1024 * 1024 : requiredAllocSize;
+        requiredAllocSize = (requiredAllocSize < 128 * 1024 * 1024)
+            ? 128 * 1024 * 1024
+            : requiredAllocSize;
-    if( maxAllocSize < requiredAllocSize )
+    if (maxAllocSize < requiredAllocSize)
-        log_error( "ERROR: Reported max allocation size is less than required of total memory! (%llu or %lluMB, from a total mem size of %lluMB)\n", maxAllocSize, (maxAllocSize / 1024)/1024, (requiredAllocSize / 1024)/1024 );
+        log_error(
+            "ERROR: Reported max allocation size is less than required of "
+            "total memory! (%llu or %lluMB, from a total mem size of %lluMB)\n",
+            maxAllocSize, (maxAllocSize / 1024) / 1024,
+            (requiredAllocSize / 1024) / 1024);
         return -1;
-    log_info("Reported max allocation size of %lld bytes (%gMB) and global mem size of %lld bytes (%gMB).\n",
-             maxAllocSize, maxAllocSize/(1024.0*1024.0), requiredAllocSize, requiredAllocSize/(1024.0*1024.0));
+    log_info("Reported max allocation size of %lld bytes (%gMB) and global mem "
+             "size of %lld bytes (%gMB).\n",
+             maxAllocSize, maxAllocSize / (1024.0 * 1024.0), requiredAllocSize,
+             requiredAllocSize / (1024.0 * 1024.0));
-    if ( memSize < maxAllocSize ) {
-        log_info("Global memory size is less than max allocation size, using that.\n");
+    if (memSize < maxAllocSize)
+    {
+        log_info("Global memory size is less than max allocation size, using "
+                 "that.\n");
         maxAllocSize = memSize;
-    minSizeToTry = maxAllocSize/16;
-    while (maxAllocSize > (maxAllocSize/4)) {
+    minSizeToTry = maxAllocSize / 16;
+    while (maxAllocSize > (maxAllocSize / 4))
+    {
-        log_info("Trying to create a buffer of size of %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0));
-        memHdl = clCreateBuffer( context, CL_MEM_READ_ONLY, (size_t)maxAllocSize, NULL, &error );
-        if (error == CL_MEM_OBJECT_ALLOCATION_FAILURE || error == CL_OUT_OF_RESOURCES || error == CL_OUT_OF_HOST_MEMORY) {
-            log_info("\tAllocation failed at size of %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0));
+        log_info("Trying to create a buffer of size of %lld bytes (%gMB).\n",
+                 maxAllocSize, (double)maxAllocSize / (1024.0 * 1024.0));
+        memHdl = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)maxAllocSize,
+                                NULL, &error);
+            || error == CL_OUT_OF_RESOURCES || error == CL_OUT_OF_HOST_MEMORY)
+        {
+            log_info("\tAllocation failed at size of %lld bytes (%gMB).\n",
+                     maxAllocSize, (double)maxAllocSize / (1024.0 * 1024.0));
             maxAllocSize -= minSizeToTry;
-        test_error( error, "clCreateBuffer failed for maximum sized buffer.");
+        test_error(error, "clCreateBuffer failed for maximum sized buffer.");
         return 0;
-    log_error("Failed to allocate even %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0));
+    log_error("Failed to allocate even %lld bytes (%gMB).\n", maxAllocSize,
+              (double)maxAllocSize / (1024.0 * 1024.0));
     return -1;
-int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimension;
@@ -554,10 +665,8 @@
     cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     cl_uint minRequiredDimension;
-    size_t length;
     auto version = get_device_cl_version(deviceID);
     if (version == Version(1, 0))
@@ -571,16 +680,20 @@
     /* Just get any ol format to test with */
-    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &image_format_desc );
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D,
+                                   CL_MEM_READ_WRITE, 0, &image_format_desc);
+    test_error(error, "Unable to obtain suitable image format to test with!");
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 2d width from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 2d width from device");
-    if( maxDimension < minRequiredDimension )
+    if (maxDimension < minRequiredDimension)
-        log_error( "ERROR: Reported max image 2d width is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 2d width is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     log_info("Max reported width is %ld.\n", maxDimension);
@@ -588,34 +701,42 @@
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE2D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
-    log_info("Attempting to create an image of size %d x 1 = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size %d x 1 = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
     /* Try to allocate a very big image */
-    streams[0] = create_image_2d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimension, 1, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 maxDimension, 1, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "Image 2D creation failed for maximum width" );
+        print_error(error, "Image 2D creation failed for maximum width");
         return -1;
     return 0;
-int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimension;
@@ -623,9 +744,8 @@
     cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     cl_uint minRequiredDimension;
-    size_t length;
     auto version = get_device_cl_version(deviceID);
     if (version == Version(1, 0))
@@ -638,16 +758,20 @@
     /* Just get any ol format to test with */
-    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &image_format_desc );
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D,
+                                   CL_MEM_READ_WRITE, 0, &image_format_desc);
+    test_error(error, "Unable to obtain suitable image format to test with!");
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 2d height from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 2d height from device");
-    if( maxDimension < minRequiredDimension )
+    if (maxDimension < minRequiredDimension)
-        log_error( "ERROR: Reported max image 2d height is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 2d height is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     log_info("Max reported height is %ld.\n", maxDimension);
@@ -655,56 +779,67 @@
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE2D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
-    log_info("Attempting to create an image of size 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x %d = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
     /* Try to allocate a very big image */
-    streams[0] = create_image_2d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, maxDimension, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 1, maxDimension, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "Image 2D creation failed for maximum height" );
+        print_error(error, "Image 2D creation failed for maximum height");
         return -1;
     return 0;
-int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     /* Just get any ol format to test with */
     error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D,
                                    CL_MEM_READ_ONLY, 0, &image_format_desc);
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    test_error(error, "Unable to obtain suitable image format to test with!");
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 3d width from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_WIDTH,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 3d width from device");
-    if( maxDimension < 2048 )
+    if (maxDimension < 2048)
-        log_error( "ERROR: Reported max image 3d width is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 3d width is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     log_info("Max reported width is %ld.\n", maxDimension);
@@ -712,56 +847,68 @@
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE3D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*2*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*2*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 2 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 2 * 4, maxAllocSize);
         return -1;
-    log_info("Attempting to create an image of size %d x 1 x 2 = %gMB.\n", (int)maxDimension, (2*(float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size %d x 1 x 2 = %gMB.\n",
+             (int)maxDimension,
+             (2 * (float)maxDimension * 4 / 1024.0 / 1024.0));
     /* Try to allocate a very big image */
-    streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimension, 1, 2, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 maxDimension, 1, 2, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "Image 3D creation failed for maximum width" );
+        print_error(error, "Image 3D creation failed for maximum width");
         return -1;
     return 0;
-int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     /* Just get any ol format to test with */
     error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D,
                                    CL_MEM_READ_ONLY, 0, &image_format_desc);
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    test_error(error, "Unable to obtain suitable image format to test with!");
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 3d height from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 3d height from device");
-    if( maxDimension < 2048 )
+    if (maxDimension < 2048)
-        log_error( "ERROR: Reported max image 3d height is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 3d height is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     log_info("Max reported height is %ld.\n", maxDimension);
@@ -769,27 +916,35 @@
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE3D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*2*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*2*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 2 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 2 * 4, maxAllocSize);
         return -1;
-    log_info("Attempting to create an image of size 1 x %d x 2 = %gMB.\n", (int)maxDimension, (2*(float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x %d x 2 = %gMB.\n",
+             (int)maxDimension,
+             (2 * (float)maxDimension * 4 / 1024.0 / 1024.0));
     /* Try to allocate a very big image */
-    streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, maxDimension, 2, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 1, maxDimension, 2, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "Image 3D creation failed for maximum height" );
+        print_error(error, "Image 3D creation failed for maximum height");
         return -1;
@@ -797,29 +952,33 @@
-int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     /* Just get any ol format to test with */
     error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D,
                                    CL_MEM_READ_ONLY, 0, &image_format_desc);
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    test_error(error, "Unable to obtain suitable image format to test with!");
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 3d depth from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_DEPTH,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 3d depth from device");
-    if( maxDimension < 2048 )
+    if (maxDimension < 2048)
-        log_error( "ERROR: Reported max image 3d depth is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 3d depth is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     log_info("Max reported depth is %ld.\n", maxDimension);
@@ -827,55 +986,67 @@
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE3D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
-    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
     /* Try to allocate a very big image */
-    streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, 1, maxDimension, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 1, 1, maxDimension, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "Image 3D creation failed for maximum depth" );
+        print_error(error, "Image 3D creation failed for maximum depth");
         return -1;
     return 0;
-int test_min_max_image_array_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_array_size(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     size_t minRequiredDimension = gIsEmbedded ? 256 : 2048;
     /* Just get any ol format to test with */
-    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D_ARRAY, CL_MEM_READ_WRITE, 0, &image_format_desc );
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D_ARRAY,
+                                   CL_MEM_READ_WRITE, 0, &image_format_desc);
+    test_error(error, "Unable to obtain suitable image format to test with!");
     /* Get the max image array size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image array size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image array size from device");
-    if( maxDimension < minRequiredDimension )
+    if (maxDimension < minRequiredDimension)
-        log_error( "ERROR: Reported max image array size is less than required! (%d)\n", (int)maxDimension );
+        log_error("ERROR: Reported max image array size is less than required! "
+                  "(%d)\n",
+                  (int)maxDimension);
         return -1;
     log_info("Max reported image array size is %ld.\n", maxDimension);
@@ -883,96 +1054,127 @@
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D_ARRAY, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE2D_ARRAY,
+                                   &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
-    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
     /* Try to allocate a very big image */
-    streams[0] = create_image_2d_array( context, CL_MEM_READ_ONLY, &image_format_desc, 1, 1, maxDimension, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] =
+        create_image_2d_array(context, CL_MEM_READ_ONLY, &image_format_desc, 1,
+                              1, maxDimension, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "2D Image Array creation failed for maximum array size" );
+        print_error(error,
+                    "2D Image Array creation failed for maximum array size");
         return -1;
     return 0;
-int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
     int error;
     size_t maxDimensionPixels;
     clMemWrapper streams[2];
-    cl_image_format image_format_desc = {0};
+    cl_image_format image_format_desc = { 0 };
     cl_ulong maxAllocSize;
     size_t minRequiredDimension = gIsEmbedded ? 2048 : 65536;
     unsigned int i = 0;
     size_t pixelBytes = 0;
     /* Get the max memory allocation size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
     /* Get the max image buffer size (in pixels) */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof( maxDimensionPixels ), &maxDimensionPixels, NULL );
-    test_error( error, "Unable to get max image buffer size from device" );
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+                        sizeof(maxDimensionPixels), &maxDimensionPixels, NULL);
+    test_error(error, "Unable to get max image buffer size from device");
-    if( maxDimensionPixels < minRequiredDimension )
+    if (maxDimensionPixels < minRequiredDimension)
-        log_error( "ERROR: Reported max image buffer size is less than required! (%d)\n", (int)maxDimensionPixels );
+        log_error("ERROR: Reported max image buffer size is less than "
+                  "required! (%d)\n",
+                  (int)maxDimensionPixels);
         return -1;
-    log_info("Max reported image buffer size is %ld pixels.\n", maxDimensionPixels);
+    log_info("Max reported image buffer size is %ld pixels.\n",
+             maxDimensionPixels);
     pixelBytes = maxAllocSize / maxDimensionPixels;
-    if ( pixelBytes == 0 )
+    if (pixelBytes == 0)
-        log_error( "Value of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is greater than CL_MAX_MEM_ALLOC_SIZE so there is no way to allocate image of maximum size!\n" );
+        log_error("Value of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is greater than "
+                  "CL_MAX_MEM_ALLOC_SIZE so there is no way to allocate image "
+                  "of maximum size!\n");
         return -1;
     error = -1;
-    for ( i = pixelBytes; i > 0; --i )
+    for (i = pixelBytes; i > 0; --i)
-        error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE1D, CL_MEM_READ_ONLY, i, &image_format_desc );
-        if ( error == CL_SUCCESS )
+        error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE1D,
+                                       CL_MEM_READ_ONLY, i, &image_format_desc);
+        if (error == CL_SUCCESS)
             pixelBytes = i;
-    test_error( error, "Device does not support format to be used to allocate image of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE\n" );
+    test_error(error,
+               "Device does not support format to be used to allocate image of "
+               "CL_DEVICE_IMAGE_MAX_BUFFER_SIZE\n");
-    log_info("Attempting to create an 1D image with channel order %s from buffer of size %d = %gMB.\n",
-        GetChannelOrderName( image_format_desc.image_channel_order ), (int)maxDimensionPixels, ((float)maxDimensionPixels*pixelBytes/1024.0/1024.0));
+    log_info("Attempting to create an 1D image with channel order %s from "
+             "buffer of size %d = %gMB.\n",
+             GetChannelOrderName(image_format_desc.image_channel_order),
+             (int)maxDimensionPixels,
+             ((float)maxDimensionPixels * pixelBytes / 1024.0 / 1024.0));
     /* Try to allocate a buffer */
-    streams[0] = clCreateBuffer( context, CL_MEM_READ_ONLY, maxDimensionPixels*pixelBytes, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = clCreateBuffer(context, CL_MEM_READ_ONLY,
+                                maxDimensionPixels * pixelBytes, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "Buffer creation failed for maximum image buffer size" );
+        print_error(error,
+                    "Buffer creation failed for maximum image buffer size");
         return -1;
     /* Try to create a 1D image from the buffer */
-    streams[1] = create_image_1d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimensionPixels, 0, NULL, streams[0], &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[1] =
+        create_image_1d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                        maxDimensionPixels, 0, NULL, streams[0], &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
-        print_error( error, "1D Image from buffer creation failed for maximum image buffer size" );
+        print_error(error,
+                    "1D Image from buffer creation failed for maximum image "
+                    "buffer size");
         return -1;
@@ -980,8 +1182,8 @@
-int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_parameter_size(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     int error, retVal, i;
     size_t maxSize;
@@ -1000,62 +1202,78 @@
     /* Get the max param size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxSize ), &maxSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxSize), &maxSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
-    if( ((!gIsEmbedded) && (maxSize < 1024)) || ((gIsEmbedded) && (maxSize < 256)) )
+    if (((!gIsEmbedded) && (maxSize < 1024))
+        || ((gIsEmbedded) && (maxSize < 256)))
-        log_error( "ERROR: Reported max parameter size is less than required! (%d)\n", (int)maxSize );
+        log_error(
+            "ERROR: Reported max parameter size is less than required! (%d)\n",
+            (int)maxSize);
         return -1;
     /* The embedded profile without cles_khr_int64 extension does not require
      * longs, so use ints */
     if (embeddedNoLong)
-        numberOfIntParametersToTry = numberExpected = (maxSize-sizeof(cl_mem))/sizeof(cl_int);
+        numberOfIntParametersToTry = numberExpected =
+            (maxSize - sizeof(cl_mem)) / sizeof(cl_int);
-        numberOfIntParametersToTry = numberExpected = (maxSize-sizeof(cl_mem))/sizeof(cl_long);
+        numberOfIntParametersToTry = numberExpected =
+            (maxSize - sizeof(cl_mem)) / sizeof(cl_long);
-    decrement = (size_t)(numberOfIntParametersToTry/8);
-    if (decrement < 1)
-        decrement = 1;
+    decrement = (size_t)(numberOfIntParametersToTry / 8);
+    if (decrement < 1) decrement = 1;
     log_info("Reported max parameter size of %d bytes.\n", (int)maxSize);
-    while (numberOfIntParametersToTry > 0) {
-        // These need to be inside to be deallocated automatically on each loop iteration.
+    while (numberOfIntParametersToTry > 0)
+    {
+        // These need to be inside to be deallocated automatically on each loop
+        // iteration.
         clProgramWrapper program;
         clMemWrapper mem;
         clKernelWrapper kernel;
         if (embeddedNoLong)
-            log_info("Trying a kernel with %ld int arguments (%ld bytes) and one cl_mem (%ld bytes) for %ld bytes total.\n",
-                     numberOfIntParametersToTry, sizeof(cl_int)*numberOfIntParametersToTry, sizeof(cl_mem),
-                     sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_int));
+            log_info(
+                "Trying a kernel with %ld int arguments (%ld bytes) and one "
+                "cl_mem (%ld bytes) for %ld bytes total.\n",
+                numberOfIntParametersToTry,
+                sizeof(cl_int) * numberOfIntParametersToTry, sizeof(cl_mem),
+                sizeof(cl_mem) + numberOfIntParametersToTry * sizeof(cl_int));
-            log_info("Trying a kernel with %ld long arguments (%ld bytes) and one cl_mem (%ld bytes) for %ld bytes total.\n",
-                     numberOfIntParametersToTry, sizeof(cl_long)*numberOfIntParametersToTry, sizeof(cl_mem),
-                     sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_long));
+            log_info(
+                "Trying a kernel with %ld long arguments (%ld bytes) and one "
+                "cl_mem (%ld bytes) for %ld bytes total.\n",
+                numberOfIntParametersToTry,
+                sizeof(cl_long) * numberOfIntParametersToTry, sizeof(cl_mem),
+                sizeof(cl_mem) + numberOfIntParametersToTry * sizeof(cl_long));
         // Allocate memory for the program storage
-        data = malloc(sizeof(cl_long)*numberOfIntParametersToTry);
+        data = malloc(sizeof(cl_long) * numberOfIntParametersToTry);
-        argumentLine = (char*)malloc(sizeof(char)*numberOfIntParametersToTry*32);
-        codeLines = (char*)malloc(sizeof(char)*numberOfIntParametersToTry*32);
-        programSrc = (char*)malloc(sizeof(char)*(numberOfIntParametersToTry*64+1024));
+        argumentLine =
+            (char *)malloc(sizeof(char) * numberOfIntParametersToTry * 32);
+        codeLines =
+            (char *)malloc(sizeof(char) * numberOfIntParametersToTry * 32);
+        programSrc = (char *)malloc(sizeof(char)
+                                    * (numberOfIntParametersToTry * 64 + 1024));
         argumentLine[0] = '\0';
         codeLines[0] = '\0';
         programSrc[0] = '\0';
         // Generate our results
         expectedResult = 0;
-        for (i=0; i<(int)numberOfIntParametersToTry; i++)
-            {
-            if( gHasLong )
+        for (i = 0; i < (int)numberOfIntParametersToTry; i++)
+        {
+            if (gHasLong)
                 ((cl_long *)data)[i] = i;
                 expectedResult += i;
@@ -1068,30 +1286,35 @@
         // Build the program
-        if( gHasLong)
+        if (gHasLong)
             sprintf(argumentLine, "%s", "long arg0");
             sprintf(argumentLine, "%s", "int arg0");
         sprintf(codeLines, "%s", "result[0] += arg0;");
-        for (i=1; i<(int)numberOfIntParametersToTry; i++)
+        for (i = 1; i < (int)numberOfIntParametersToTry; i++)
-            if( gHasLong)
-                sprintf(argumentLine + strlen( argumentLine), ", long arg%d", i);
+            if (gHasLong)
+                sprintf(argumentLine + strlen(argumentLine), ", long arg%d", i);
-                sprintf(argumentLine + strlen( argumentLine), ", int arg%d", i);
+                sprintf(argumentLine + strlen(argumentLine), ", int arg%d", i);
-            sprintf(codeLines + strlen( codeLines), "\nresult[0] += arg%d;", i);
+            sprintf(codeLines + strlen(codeLines), "\nresult[0] += arg%d;", i);
         /* Create a kernel to test with */
-        sprintf( programSrc, gHasLong ?  sample_large_parmam_kernel_pattern[0]:
-                                        sample_large_int_parmam_kernel_pattern[0], argumentLine, codeLines);
+        sprintf(programSrc,
+                gHasLong ? sample_large_parmam_kernel_pattern[0]
+                         : sample_large_int_parmam_kernel_pattern[0],
+                argumentLine, codeLines);
         ptr = programSrc;
-        if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&ptr, "sample_test" ) != 0 )
+        if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&ptr, "sample_test")
+            != 0)
-            log_info("Create program failed, decrementing number of parameters to try.\n");
+            log_info("Create program failed, decrementing number of parameters "
+                     "to try.\n");
             numberOfIntParametersToTry -= decrement;
@@ -1103,88 +1326,119 @@
         test_error(error, "clCreateBuffer failed");
-        for (i=0; i<(int)numberOfIntParametersToTry; i++) {
-            if(gHasLong)
-                error = clSetKernelArg(kernel, i, sizeof(cl_long), &(((cl_long*)data)[i]));
+        for (i = 0; i < (int)numberOfIntParametersToTry; i++)
+        {
+            if (gHasLong)
+                error = clSetKernelArg(kernel, i, sizeof(cl_long),
+                                       &(((cl_long *)data)[i]));
-                error = clSetKernelArg(kernel, i, sizeof(cl_int), &(((cl_int*)data)[i]));
+                error = clSetKernelArg(kernel, i, sizeof(cl_int),
+                                       &(((cl_int *)data)[i]));
-            if (error != CL_SUCCESS) {
-                log_info( "clSetKernelArg failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error));
+            if (error != CL_SUCCESS)
+            {
+                log_info("clSetKernelArg failed (%s), decrementing number of "
+                         "parameters to try.\n",
+                         IGetErrorString(error));
                 numberOfIntParametersToTry -= decrement;
-        if (error != CL_SUCCESS)
-            continue;
+        if (error != CL_SUCCESS) continue;
         error = clSetKernelArg(kernel, i, sizeof(cl_mem), &mem);
-        if (error != CL_SUCCESS) {
-            log_info( "clSetKernelArg failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error));
+        if (error != CL_SUCCESS)
+        {
+            log_info("clSetKernelArg failed (%s), decrementing number of "
+                     "parameters to try.\n",
+                     IGetErrorString(error));
             numberOfIntParametersToTry -= decrement;
-        size_t globalDim[3]={1,1,1}, localDim[3]={1,1,1};
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, 0, NULL, &event);
-        if (error != CL_SUCCESS) {
-            log_info( "clEnqueueNDRangeKernel failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error));
+        size_t globalDim[3] = { 1, 1, 1 }, localDim[3] = { 1, 1, 1 };
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim,
+                                       localDim, 0, NULL, &event);
+        if (error != CL_SUCCESS)
+        {
+            log_info("clEnqueueNDRangeKernel failed (%s), decrementing number "
+                     "of parameters to try.\n",
+                     IGetErrorString(error));
             numberOfIntParametersToTry -= decrement;
         // Verify that the event does not return an error from the execution
         error = clWaitForEvents(1, &event);
-        test_error( error, "clWaitForEvent failed");
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-        test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+        test_error(error, "clWaitForEvent failed");
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(event_status), &event_status, NULL);
+        test_error(
+            error,
+            "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
         if (event_status < 0)
             test_error(error, "Kernel execution event returned error");
-        if(gHasLong)
-            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_long), &long_result, 0, NULL, NULL);
+        if (gHasLong)
+            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_long),
+                                        &long_result, 0, NULL, NULL);
-            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_int), &int_result, 0, NULL, NULL);
+            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_int),
+                                        &int_result, 0, NULL, NULL);
         test_error(error, "clEnqueueReadBuffer failed")
-        free(data);
+            free(data);
-        if(gHasLong)
+        if (gHasLong)
-            if (long_result != expectedResult) {
-                log_error("Expected result (%lld) does not equal actual result (%lld).\n", expectedResult, long_result);
+            if (long_result != expectedResult)
+            {
+                log_error("Expected result (%lld) does not equal actual result "
+                          "(%lld).\n",
+                          expectedResult, long_result);
                 numberOfIntParametersToTry -= decrement;
-            } else {
-                log_info("Results verified at %ld bytes of arguments.\n", sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_long));
+            }
+            else
+            {
+                log_info("Results verified at %ld bytes of arguments.\n",
+                         sizeof(cl_mem)
+                             + numberOfIntParametersToTry * sizeof(cl_long));
-            if (int_result != expectedResult) {
-                log_error("Expected result (%lld) does not equal actual result (%d).\n", expectedResult, int_result);
+            if (int_result != expectedResult)
+            {
+                log_error("Expected result (%lld) does not equal actual result "
+                          "(%d).\n",
+                          expectedResult, int_result);
                 numberOfIntParametersToTry -= decrement;
-            } else {
-                log_info("Results verified at %ld bytes of arguments.\n", sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_int));
+            }
+            else
+            {
+                log_info("Results verified at %ld bytes of arguments.\n",
+                         sizeof(cl_mem)
+                             + numberOfIntParametersToTry * sizeof(cl_int));
-    if (numberOfIntParametersToTry == (long)numberExpected)
-        return 0;
+    if (numberOfIntParametersToTry == (long)numberExpected) return 0;
     return -1;
-int test_min_max_samplers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_samplers(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
     int error;
     cl_uint maxSamplers, i;
@@ -1197,104 +1451,124 @@
     cl_uint minRequiredSamplers = gIsEmbedded ? 8 : 16;
     /* Get the max value */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_SAMPLERS, sizeof( maxSamplers ), &maxSamplers, NULL );
-    test_error( error, "Unable to get max sampler count from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_SAMPLERS,
+                            sizeof(maxSamplers), &maxSamplers, NULL);
+    test_error(error, "Unable to get max sampler count from device");
-    if( maxSamplers < minRequiredSamplers )
+    if (maxSamplers < minRequiredSamplers)
-        log_error( "ERROR: Reported max sampler count is less than required! (%d)\n", (int)maxSamplers );
+        log_error(
+            "ERROR: Reported max sampler count is less than required! (%d)\n",
+            (int)maxSamplers);
         return -1;
     log_info("Reported max %d samplers.\n", maxSamplers);
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
     // Subtract the size of the two cl_mem arguments (image and buffer)
-    maxParameterSize -= 2*sizeof(cl_mem);
+    maxParameterSize -= 2 * sizeof(cl_mem);
     // Calculate the number we can use
-    if (maxParameterSize/sizeof(cl_sampler) < maxSamplers) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max sampler arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_sampler)));
-        maxSamplers = (unsigned int)(maxParameterSize/sizeof(cl_sampler));
+    if (maxParameterSize / sizeof(cl_sampler) < maxSamplers)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max sampler arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / sizeof(cl_sampler)));
+        maxSamplers = (unsigned int)(maxParameterSize / sizeof(cl_sampler));
     /* Create a kernel to test with */
-    programSrc = (char *)malloc( ( strlen( sample_sampler_kernel_pattern[ 1 ] ) + 8 ) * ( maxSamplers ) +
-                                strlen( sample_sampler_kernel_pattern[ 0 ] ) + strlen( sample_sampler_kernel_pattern[ 2 ] ) +
-                                ( strlen( sample_sampler_kernel_pattern[ 3 ] ) + 8 ) * maxSamplers +
-                                strlen( sample_sampler_kernel_pattern[ 4 ] ) );
-    strcpy( programSrc, sample_sampler_kernel_pattern[ 0 ] );
-    for( i = 0; i < maxSamplers; i++ )
+    programSrc = (char *)malloc(
+        (strlen(sample_sampler_kernel_pattern[1]) + 8) * (maxSamplers)
+        + strlen(sample_sampler_kernel_pattern[0])
+        + strlen(sample_sampler_kernel_pattern[2])
+        + (strlen(sample_sampler_kernel_pattern[3]) + 8) * maxSamplers
+        + strlen(sample_sampler_kernel_pattern[4]));
+    strcpy(programSrc, sample_sampler_kernel_pattern[0]);
+    for (i = 0; i < maxSamplers; i++)
-        sprintf( samplerLine, sample_sampler_kernel_pattern[ 1 ], i );
-        strcat( programSrc, samplerLine );
+        sprintf(samplerLine, sample_sampler_kernel_pattern[1], i);
+        strcat(programSrc, samplerLine);
-    strcat( programSrc, sample_sampler_kernel_pattern[ 2 ] );
-    for( i = 0; i < maxSamplers; i++ )
+    strcat(programSrc, sample_sampler_kernel_pattern[2]);
+    for (i = 0; i < maxSamplers; i++)
-        sprintf( samplerLine, sample_sampler_kernel_pattern[ 3 ], i );
-        strcat( programSrc, samplerLine );
+        sprintf(samplerLine, sample_sampler_kernel_pattern[3], i);
+        strcat(programSrc, samplerLine);
-    strcat( programSrc, sample_sampler_kernel_pattern[ 4 ] );
+    strcat(programSrc, sample_sampler_kernel_pattern[4]);
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test");
-    test_error( error, "Failed to create the program and kernel.");
+    error =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programSrc, "sample_test");
+    test_error(error, "Failed to create the program and kernel.");
     // We have to set up some fake parameters so it'll work
     clSamplerWrapper *samplers = new clSamplerWrapper[maxSamplers];
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    clMemWrapper image = create_image_2d( context, CL_MEM_READ_WRITE, &format, 16, 16, 0, NULL, &error );
-    test_error( error, "Unable to create a test image" );
+    clMemWrapper image = create_image_2d(context, CL_MEM_READ_WRITE, &format,
+                                         16, 16, 0, NULL, &error);
+    test_error(error, "Unable to create a test image");
     clMemWrapper stream =
         clCreateBuffer(context, CL_MEM_READ_WRITE, 16, NULL, &error);
-    test_error( error, "Unable to create test buffer" );
+    test_error(error, "Unable to create test buffer");
-    error = clSetKernelArg( kernel, 0, sizeof( cl_mem ), &image );
-    error |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), &stream );
-    test_error( error, "Unable to set kernel arguments" );
-    for( i = 0; i < maxSamplers; i++ )
+    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
+    error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &stream);
+    test_error(error, "Unable to set kernel arguments");
+    for (i = 0; i < maxSamplers; i++)
-        samplers[ i ] = clCreateSampler( context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
-        test_error( error, "Unable to create sampler" );
+        samplers[i] = clCreateSampler(context, CL_FALSE, CL_ADDRESS_NONE,
+                                      CL_FILTER_NEAREST, &error);
+        test_error(error, "Unable to create sampler");
-        error = clSetKernelArg( kernel, 2 + i, sizeof( cl_sampler ), &samplers[ i ] );
-        test_error( error, "Unable to set sampler argument" );
+        error = clSetKernelArg(kernel, 2 + i, sizeof(cl_sampler), &samplers[i]);
+        test_error(error, "Unable to set sampler argument");
-    size_t globalDim[3]={1,1,1}, localDim[3]={1,1,1};
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, 0, NULL, &event);
-    test_error(error, "clEnqueueNDRangeKernel failed with maximum number of samplers.");
+    size_t globalDim[3] = { 1, 1, 1 }, localDim[3] = { 1, 1, 1 };
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim,
+                                   0, NULL, &event);
+    test_error(
+        error,
+        "clEnqueueNDRangeKernel failed with maximum number of samplers.");
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
-    free( programSrc );
+    free(programSrc);
     delete[] samplers;
     return 0;
-int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    size_t    threads[1], localThreads[1];
+    size_t threads[1], localThreads[1];
     cl_int *constantData, *resultData;
     cl_ulong maxSize, stepSize, currentSize, maxGlobalSize, maxAllocSize;
     int i;
@@ -1303,48 +1577,56 @@
     MTdata d;
     /* Verify our test buffer won't be bigger than allowed */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( error, "Unable to get max constant buffer size" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+                            sizeof(maxSize), &maxSize, 0);
+    test_error(error, "Unable to get max constant buffer size");
-    if( ( 0 == gIsEmbedded && maxSize < 64L * 1024L ) || maxSize <  1L * 1024L )
+    if ((0 == gIsEmbedded && maxSize < 64L * 1024L) || maxSize < 1L * 1024L)
-        log_error( "ERROR: Reported max constant buffer size less than required by OpenCL 1.0 (reported %d KB)\n", (int)( maxSize / 1024L ) );
+        log_error("ERROR: Reported max constant buffer size less than required "
+                  "by OpenCL 1.0 (reported %d KB)\n",
+                  (int)(maxSize / 1024L));
         return -1;
     log_info("Reported max constant buffer size of %lld bytes.\n", maxSize);
     // Limit test buffer size to 1/8 of CL_DEVICE_GLOBAL_MEM_SIZE
-    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalSize), &maxGlobalSize, 0);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(maxGlobalSize), &maxGlobalSize, 0);
     test_error(error, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE");
-    if (maxSize > maxGlobalSize / 8)
-        maxSize = maxGlobalSize / 8;
+    if (maxSize > maxGlobalSize / 8) maxSize = maxGlobalSize / 8;
-    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(maxAllocSize), &maxAllocSize, 0);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, 0);
     test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE ");
-    if (maxSize > maxAllocSize)
-        maxSize = maxAllocSize;
+    if (maxSize > maxAllocSize) maxSize = maxAllocSize;
     /* Create a kernel to test with */
-    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_const_arg_kernel, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    sample_const_arg_kernel, "sample_test")
+        != 0)
         return -1;
     /* Try the returned max size and decrease it until we get one that works. */
-    stepSize = maxSize/16;
+    stepSize = maxSize / 16;
     currentSize = maxSize;
     int allocPassed = 0;
-    d = init_genrand( gRandomSeed );
-    while (!allocPassed && currentSize >= maxSize/PASSING_FRACTION) {
-        log_info("Attempting to allocate constant buffer of size %lld bytes\n", maxSize);
+    d = init_genrand(gRandomSeed);
+    while (!allocPassed && currentSize >= maxSize / PASSING_FRACTION)
+    {
+        log_info("Attempting to allocate constant buffer of size %lld bytes\n",
+                 maxSize);
         /* Create some I/O streams */
-        size_t sizeToAllocate = ((size_t)currentSize/sizeof( cl_int ))*sizeof(cl_int);
-        size_t numberOfInts = sizeToAllocate/sizeof(cl_int);
-        constantData = (cl_int *)malloc( sizeToAllocate);
+        size_t sizeToAllocate =
+            ((size_t)currentSize / sizeof(cl_int)) * sizeof(cl_int);
+        size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
+        constantData = (cl_int *)malloc(sizeToAllocate);
         if (constantData == NULL)
             log_error("Failed to allocate memory for constantData!\n");
@@ -1352,53 +1634,74 @@
             return EXIT_FAILURE;
-        for(i=0; i<(int)(numberOfInts); i++)
+        for (i = 0; i < (int)(numberOfInts); i++)
             constantData[i] = (int)genrand_int32(d);
         clMemWrapper streams[3];
         streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                     sizeToAllocate, constantData, &error);
-        test_error( error, "Creating test array failed" );
+        test_error(error, "Creating test array failed");
         streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
                                     NULL, &error);
-        test_error( error, "Creating test array failed" );
+        test_error(error, "Creating test array failed");
         /* Set the arguments */
-        error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0]);
-        test_error( error, "Unable to set indexed kernel arguments" );
-        error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1]);
-        test_error( error, "Unable to set indexed kernel arguments" );
+        error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
         /* Test running the kernel and verifying it */
         threads[0] = numberOfInts;
         localThreads[0] = 1;
-        log_info("Filling constant buffer with %d cl_ints (%d bytes).\n", (int)threads[0], (int)(threads[0]*sizeof(cl_int)));
+        log_info("Filling constant buffer with %d cl_ints (%d bytes).\n",
+                 (int)threads[0], (int)(threads[0] * sizeof(cl_int)));
-        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &event );
-        /* If we failed due to a resource issue, reduce the size and try again. */
-        if ((error == CL_OUT_OF_RESOURCES) || (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) || (error == CL_OUT_OF_HOST_MEMORY)) {
-            log_info("Kernel enqueue failed at size %lld, trying at a reduced size.\n", currentSize);
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                       localThreads, 0, NULL, &event);
+        /* If we failed due to a resource issue, reduce the size and try again.
+         */
+        if ((error == CL_OUT_OF_RESOURCES)
+            || (error == CL_MEM_OBJECT_ALLOCATION_FAILURE)
+            || (error == CL_OUT_OF_HOST_MEMORY))
+        {
+            log_info("Kernel enqueue failed at size %lld, trying at a reduced "
+                     "size.\n",
+                     currentSize);
             currentSize -= stepSize;
-        test_error( error, "clEnqueueNDRangeKernel with maximum constant buffer size failed.");
+        test_error(
+            error,
+            "clEnqueueNDRangeKernel with maximum constant buffer size failed.");
         // Verify that the event does not return an error from the execution
         error = clWaitForEvents(1, &event);
-        test_error( error, "clWaitForEvent failed");
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-        test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+        test_error(error, "clWaitForEvent failed");
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(event_status), &event_status, NULL);
+        test_error(
+            error,
+            "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
-        if (event_status < 0) {
-            if ((event_status == CL_OUT_OF_RESOURCES) || (event_status == CL_MEM_OBJECT_ALLOCATION_FAILURE) || (event_status == CL_OUT_OF_HOST_MEMORY)) {
-                log_info("Kernel event indicates failure at size %lld, trying at a reduced size.\n", currentSize);
+        if (event_status < 0)
+        {
+            if ((event_status == CL_OUT_OF_RESOURCES)
+                || (event_status == CL_MEM_OBJECT_ALLOCATION_FAILURE)
+                || (event_status == CL_OUT_OF_HOST_MEMORY))
+            {
+                log_info("Kernel event indicates failure at size %lld, trying "
+                         "at a reduced size.\n",
+                         currentSize);
                 currentSize -= stepSize;
-            } else {
+            }
+            else
+            {
                 test_error(error, "Kernel execution event returned error");
@@ -1415,30 +1718,41 @@
             return EXIT_FAILURE;
-        error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, resultData, 0, NULL, NULL);
-        test_error( error, "clEnqueueReadBuffer failed");
+        error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                    sizeToAllocate, resultData, 0, NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed");
-        for(i=0; i<(int)(numberOfInts); i++)
-            if (constantData[i] != resultData[i]) {
-                log_error("Data failed to verify: constantData[%d]=%d != resultData[%d]=%d\n",
+        for (i = 0; i < (int)(numberOfInts); i++)
+            if (constantData[i] != resultData[i])
+            {
+                log_error("Data failed to verify: constantData[%d]=%d != "
+                          "resultData[%d]=%d\n",
                           i, constantData[i], i, resultData[i]);
-                free( constantData );
+                free(constantData);
-                free_mtdata(d);   d = NULL;
+                free_mtdata(d);
+                d = NULL;
                 return -1;
-        free( constantData );
+        free(constantData);
-    free_mtdata(d);   d = NULL;
+    free_mtdata(d);
+    d = NULL;
-    if (allocPassed) {
-        if (currentSize < maxSize/PASSING_FRACTION) {
-            log_error("Failed to allocate at least 1/8 of the reported constant size.\n");
+    if (allocPassed)
+    {
+        if (currentSize < maxSize / PASSING_FRACTION)
+        {
+            log_error("Failed to allocate at least 1/8 of the reported "
+                      "constant size.\n");
             return -1;
-        } else if (currentSize != maxSize) {
-            log_info("Passed at reduced size. (%lld of %lld bytes)\n", currentSize, maxSize);
+        }
+        else if (currentSize != maxSize)
+        {
+            log_info("Passed at reduced size. (%lld of %lld bytes)\n",
+                     currentSize, maxSize);
             return 0;
         return 0;
@@ -1446,13 +1760,14 @@
     return -1;
-int test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_constant_args(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper    *streams;
-    size_t    threads[1], localThreads[1];
+    clMemWrapper *streams;
+    size_t threads[1], localThreads[1];
     cl_uint i, maxArgs;
     cl_ulong maxSize;
     cl_ulong maxParameterSize;
@@ -1465,119 +1780,145 @@
     /* Verify our test buffer won't be bigger than allowed */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof( maxArgs ), &maxArgs, 0 );
-    test_error( error, "Unable to get max constant arg count" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_ARGS,
+                            sizeof(maxArgs), &maxArgs, 0);
+    test_error(error, "Unable to get max constant arg count");
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
     // Subtract the size of the result
     maxParameterSize -= sizeof(cl_mem);
     // Calculate the number we can use
-    if (maxParameterSize/sizeof(cl_mem) < maxArgs) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_mem)));
-        maxArgs = (unsigned int)(maxParameterSize/sizeof(cl_mem));
+    if (maxParameterSize / sizeof(cl_mem) < maxArgs)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max image arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / sizeof(cl_mem)));
+        maxArgs = (unsigned int)(maxParameterSize / sizeof(cl_mem));
-    if( maxArgs < (gIsEmbedded ? 4 : 8) )
+    if (maxArgs < (gIsEmbedded ? 4 : 8))
-        log_error( "ERROR: Reported max constant arg count less than required by OpenCL 1.0 (reported %d)\n", (int)maxArgs );
+        log_error("ERROR: Reported max constant arg count less than required "
+                  "by OpenCL 1.0 (reported %d)\n",
+                  (int)maxArgs);
         return -1;
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( error, "Unable to get max constant buffer size" );
-    individualBufferSize = ((int)maxSize/2)/maxArgs;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+                            sizeof(maxSize), &maxSize, 0);
+    test_error(error, "Unable to get max constant buffer size");
+    individualBufferSize = (maxSize / 2) / maxArgs;
-    log_info("Reported max constant arg count of %d and max constant buffer size of %d. Test will attempt to allocate half of that, or %d buffers of size %d.\n",
-             (int)maxArgs, (int)maxSize, (int)maxArgs, (int)individualBufferSize);
+    log_info(
+        "Reported max constant arg count of %u and max constant buffer "
+        "size of %llu. Test will attempt to allocate half of that, or %llu "
+        "buffers of size %zu.\n",
+        maxArgs, maxSize, maxArgs, individualBufferSize);
-    str2 = (char*)malloc(sizeof(char)*32*(maxArgs+2));
-    constArgs = (char*)malloc(sizeof(char)*32*(maxArgs+2));
-    programSrc = (char*)malloc(sizeof(char)*32*2*(maxArgs+2)+1024);
+    str2 = (char *)malloc(sizeof(char) * 32 * (maxArgs + 2));
+    constArgs = (char *)malloc(sizeof(char) * 32 * (maxArgs + 2));
+    programSrc = (char *)malloc(sizeof(char) * 32 * 2 * (maxArgs + 2) + 1024);
     /* Create a test program */
     constArgs[0] = 0;
     str2[0] = 0;
-    for( i = 0; i < maxArgs-1; i++ )
+    for (i = 0; i < maxArgs - 1; i++)
-        sprintf( str, ", __constant int *src%d", (int)( i + 2 ) );
-        strcat( constArgs, str );
-        sprintf( str2 + strlen( str2), "\tdst[tid] += src%d[tid];\n", (int)(i+2));
-        if (strlen(str2) > (sizeof(char)*32*(maxArgs+2)-32) || strlen(constArgs) > (sizeof(char)*32*(maxArgs+2)-32)) {
-            log_info("Limiting number of arguments tested to %d due to test program allocation size.\n", i);
+        sprintf(str, ", __constant int *src%d", (int)(i + 2));
+        strcat(constArgs, str);
+        sprintf(str2 + strlen(str2), "\tdst[tid] += src%d[tid];\n",
+                (int)(i + 2));
+        if (strlen(str2) > (sizeof(char) * 32 * (maxArgs + 2) - 32)
+            || strlen(constArgs) > (sizeof(char) * 32 * (maxArgs + 2) - 32))
+        {
+            log_info("Limiting number of arguments tested to %d due to test "
+                     "program allocation size.\n",
+                     i);
-    sprintf( programSrc, sample_const_max_arg_kernel_pattern, constArgs, str2 );
+    sprintf(programSrc, sample_const_max_arg_kernel_pattern, constArgs, str2);
     /* Create a kernel to test with */
     ptr = programSrc;
-    if( create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                    "sample_test")
+        != 0)
         return -1;
     /* Create some I/O streams */
-    streams = new clMemWrapper[ maxArgs + 1 ];
-    for( i = 0; i < maxArgs + 1; i++ )
+    streams = new clMemWrapper[maxArgs + 1];
+    for (i = 0; i < maxArgs + 1; i++)
         streams[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                     individualBufferSize, NULL, &error);
-        test_error( error, "Creating test array failed" );
+        test_error(error, "Creating test array failed");
     /* Set the arguments */
-    for( i = 0; i < maxArgs + 1; i++ )
+    for (i = 0; i < maxArgs + 1; i++)
-        error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
-        test_error( error, "Unable to set kernel argument" );
+        error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+        test_error(error, "Unable to set kernel argument");
     /* Test running the kernel and verifying it */
     threads[0] = (size_t)10;
-    while (threads[0]*sizeof(cl_int) > individualBufferSize)
-        threads[0]--;
+    while (threads[0] * sizeof(cl_int) > individualBufferSize) threads[0]--;
-    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
-    test_error( error, "Unable to get work group size to use" );
+    error = get_max_common_work_group_size(context, kernel, threads[0],
+                                           &localThreads[0]);
+    test_error(error, "Unable to get work group size to use");
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &event );
-    test_error( error, "clEnqueueNDRangeKernel failed");
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event);
+    test_error(error, "clEnqueueNDRangeKernel failed");
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
     error = clFinish(queue);
-    test_error( error, "clFinish failed.");
+    test_error(error, "clFinish failed.");
-    delete [] streams;
+    delete[] streams;
     return 0;
-int test_min_max_compute_units(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_compute_units(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
     int error;
     cl_uint value;
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get compute unit count" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(value), &value, 0);
+    test_error(error, "Unable to get compute unit count");
-    if( value < 1 )
+    if (value < 1)
-        log_error( "ERROR: Reported compute unit count less than required by OpenCL 1.0 (reported %d)\n", (int)value );
+        log_error("ERROR: Reported compute unit count less than required by "
+                  "OpenCL 1.0 (reported %d)\n",
+                  (int)value);
         return -1;
@@ -1586,18 +1927,22 @@
     return 0;
-int test_min_max_address_bits(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_address_bits(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
     int error;
     cl_uint value;
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_ADDRESS_BITS, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get address bit count" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS, sizeof(value),
+                            &value, 0);
+    test_error(error, "Unable to get address bit count");
-    if( value != 32 && value != 64 )
+    if (value != 32 && value != 64)
-        log_error( "ERROR: Reported address bit count not valid by OpenCL 1.0 (reported %d)\n", (int)value );
+        log_error("ERROR: Reported address bit count not valid by OpenCL 1.0 "
+                  "(reported %d)\n",
+                  (int)value);
         return -1;
@@ -1606,68 +1951,84 @@
     return 0;
-int test_min_max_single_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_single_fp_config(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     int error;
     cl_device_fp_config value;
     char profile[128] = "";
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get device single fp config" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(value),
+                            &value, 0);
+    test_error(error, "Unable to get device single fp config");
-    //Check to see if we are an embedded profile device
-    if((error = clGetDeviceInfo( deviceID, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL )))
+    // Check to see if we are an embedded profile device
+    if ((error = clGetDeviceInfo(deviceID, CL_DEVICE_PROFILE, sizeof(profile),
+                                 profile, NULL)))
-        log_error( "FAILURE: Unable to get CL_DEVICE_PROFILE: error %d\n", error );
+        log_error("FAILURE: Unable to get CL_DEVICE_PROFILE: error %d\n",
+                  error);
         return error;
-    if( 0 == strcmp( profile, "EMBEDDED_PROFILE" ))
+    if (0 == strcmp(profile, "EMBEDDED_PROFILE"))
     { // embedded device
-        if( 0 == (value & (CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO)))
+        if (0 == (value & (CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO)))
-            log_error( "FAILURE: embedded device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
+            log_error("FAILURE: embedded device supports neither "
+                      "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
             return -1;
     { // Full profile
-        if( ( value & ( CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN )) != ( CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN ) )
+        if ((value & (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN))
+            != (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN))
-            log_error( "ERROR: Reported single fp config doesn't meet minimum set by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+            log_error("ERROR: Reported single fp config doesn't meet minimum "
+                      "set by OpenCL 1.0 (reported 0x%08x)\n",
+                      (int)value);
             return -1;
     return 0;
-int test_min_max_double_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_double_fp_config(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     int error;
     cl_device_fp_config value;
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get device double fp config" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(value),
+                            &value, 0);
+    test_error(error, "Unable to get device double fp config");
-    if (value == 0)
-        return 0;
+    if (value == 0) return 0;
+    if ((value
-        log_error( "ERROR: Reported double fp config doesn't meet minimum set by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+        log_error("ERROR: Reported double fp config doesn't meet minimum set "
+                  "by OpenCL 1.0 (reported 0x%08x)\n",
+                  (int)value);
         return -1;
     return 0;
-int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper            streams[3];
-    size_t    threads[1], localThreads[1];
+    clMemWrapper streams[3];
+    size_t threads[1], localThreads[1];
     cl_int *localData, *resultData;
     cl_ulong maxSize, kernelLocalUsage, min_max_local_mem_size;
     Version device_version;
@@ -1676,8 +2037,9 @@
     MTdata d;
     /* Verify our test buffer won't be bigger than allowed */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( error, "Unable to get max local buffer size" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(maxSize),
+                            &maxSize, 0);
+    test_error(error, "Unable to get max local buffer size");
@@ -1709,65 +2071,80 @@
         return -1;
-    log_info("Reported max local buffer size for device: %lld bytes.\n", maxSize);
+    log_info("Reported max local buffer size for device: %lld bytes.\n",
+             maxSize);
     /* Create a kernel to test with */
-    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_local_arg_kernel, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    sample_local_arg_kernel, "sample_test")
+        != 0)
         return -1;
-    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernelLocalUsage), &kernelLocalUsage, NULL);
-    test_error(error, "clGetKernelWorkGroupInfo for CL_KERNEL_LOCAL_MEM_SIZE failed");
+    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                                     sizeof(kernelLocalUsage),
+                                     &kernelLocalUsage, NULL);
+    test_error(error,
+               "clGetKernelWorkGroupInfo for CL_KERNEL_LOCAL_MEM_SIZE failed");
-    log_info("Reported local buffer usage for kernel (CL_KERNEL_LOCAL_MEM_SIZE): %lld bytes.\n", kernelLocalUsage);
+    log_info("Reported local buffer usage for kernel "
+             "(CL_KERNEL_LOCAL_MEM_SIZE): %lld bytes.\n",
+             kernelLocalUsage);
     /* Create some I/O streams */
-    size_t sizeToAllocate = ((size_t)(maxSize-kernelLocalUsage)/sizeof( cl_int ))*sizeof(cl_int);
-    size_t numberOfInts = sizeToAllocate/sizeof(cl_int);
+    size_t sizeToAllocate =
+        ((size_t)(maxSize - kernelLocalUsage) / sizeof(cl_int))
+        * sizeof(cl_int);
+    size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
-    log_info("Attempting to use %lld bytes of local memory.\n", (cl_ulong)sizeToAllocate);
+    log_info("Attempting to use %zu bytes of local memory.\n", sizeToAllocate);
-    localData = (cl_int *)malloc( sizeToAllocate );
-    d = init_genrand( gRandomSeed );
-    for(i=0; i<(int)(numberOfInts); i++)
+    localData = (cl_int *)malloc(sizeToAllocate);
+    d = init_genrand(gRandomSeed);
+    for (i = 0; i < (int)(numberOfInts); i++)
         localData[i] = (int)genrand_int32(d);
-    free_mtdata(d); d = NULL;
+    free_mtdata(d);
+    d = NULL;
     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate,
                                 localData, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
                                 NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     /* Set the arguments */
     error = clSetKernelArg(kernel, 0, sizeToAllocate, NULL);
-    test_error( error, "Unable to set indexed kernel arguments" );
-    error = clSetKernelArg(kernel, 1, sizeof( streams[0] ), &streams[0]);
-    test_error( error, "Unable to set indexed kernel arguments" );
-    error = clSetKernelArg(kernel, 2, sizeof( streams[1] ), &streams[1]);
-    test_error( error, "Unable to set indexed kernel arguments" );
+    test_error(error, "Unable to set indexed kernel arguments");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set indexed kernel arguments");
+    error = clSetKernelArg(kernel, 2, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set indexed kernel arguments");
     /* Test running the kernel and verifying it */
     threads[0] = numberOfInts;
     localThreads[0] = 1;
-    log_info("Creating local buffer with %d cl_ints (%d bytes).\n", (int)numberOfInts, (int)sizeToAllocate);
+    log_info("Creating local buffer with %zu cl_ints (%zu bytes).\n",
+             numberOfInts, sizeToAllocate);
     cl_event evt;
-    cl_int   evt_err;
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &evt );
+    cl_int evt_err;
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &evt);
     test_error(error, "clEnqueueNDRangeKernel failed");
     error = clFinish(queue);
-    test_error( error, "clFinish failed");
+    test_error(error, "clFinish failed");
-    error = clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof evt_err, &evt_err, NULL);
-    test_error( error, "clGetEventInfo with maximum local buffer size failed.");
+    error = clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof evt_err, &evt_err, NULL);
+    test_error(error, "clGetEventInfo with maximum local buffer size failed.");
-    if (evt_err != CL_COMPLETE) {
+    if (evt_err != CL_COMPLETE)
+    {
         print_error(evt_err, "Kernel event returned error");
         return -1;
@@ -1775,95 +2152,118 @@
     resultData = (cl_int *)malloc(sizeToAllocate);
-    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, resultData, 0, NULL, NULL);
-    test_error( error, "clEnqueueReadBuffer failed");
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate,
+                                resultData, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
-    for(i=0; i<(int)(numberOfInts); i++)
-        if (localData[i] != resultData[i]) {
+    for (i = 0; i < (int)(numberOfInts); i++)
+        if (localData[i] != resultData[i])
+        {
-            free( localData );
+            free(localData);
             log_error("Results failed to verify.\n");
             return -1;
-    free( localData );
+    free(localData);
     return err;
-int test_min_max_kernel_preferred_work_group_size_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_kernel_preferred_work_group_size_multiple(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
-    int                err;
+    int err;
     clProgramWrapper program;
     clKernelWrapper kernel;
     size_t max_local_workgroup_size[3];
     size_t max_workgroup_size = 0, preferred_workgroup_size = 0;
-    err = create_single_kernel_helper(context, &program, &kernel, 1, sample_local_arg_kernel, "sample_test" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      sample_local_arg_kernel, "sample_test");
     test_error(err, "Failed to build kernel/program.");
     err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE,
-                                   sizeof(max_workgroup_size), &max_workgroup_size, NULL);
+                                   sizeof(max_workgroup_size),
+                                   &max_workgroup_size, NULL);
     test_error(err, "clGetKernelWorkgroupInfo failed.");
-    err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-                                   sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
+    err = clGetKernelWorkGroupInfo(
+        sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
     test_error(err, "clGetKernelWorkgroupInfo failed.");
-    err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
+    err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                          sizeof(max_local_workgroup_size),
+                          max_local_workgroup_size, NULL);
     test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
-    // Since the preferred size is only a performance hint, we can only really check that we get a sane value
-    // back
-    log_info( "size: %ld     preferred: %ld      max: %ld\n", max_workgroup_size, preferred_workgroup_size, max_local_workgroup_size[0] );
+    // Since the preferred size is only a performance hint, we can only really
+    // check that we get a sane value back
+    log_info("size: %ld     preferred: %ld      max: %ld\n", max_workgroup_size,
+             preferred_workgroup_size, max_local_workgroup_size[0]);
-    if( preferred_workgroup_size > max_workgroup_size )
+    if (preferred_workgroup_size > max_workgroup_size)
-        log_error( "ERROR: Reported preferred workgroup multiple larger than max workgroup size (preferred %ld, max %ld)\n", preferred_workgroup_size, max_workgroup_size );
+        log_error("ERROR: Reported preferred workgroup multiple larger than "
+                  "max workgroup size (preferred %ld, max %ld)\n",
+                  preferred_workgroup_size, max_workgroup_size);
         return -1;
     return 0;
-int test_min_max_execution_capabilities(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_execution_capabilities(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
     int error;
     cl_device_exec_capabilities value;
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get execution capabilities" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_EXECUTION_CAPABILITIES,
+                            sizeof(value), &value, 0);
+    test_error(error, "Unable to get execution capabilities");
-    if( ( value & CL_EXEC_KERNEL ) != CL_EXEC_KERNEL )
+    if ((value & CL_EXEC_KERNEL) != CL_EXEC_KERNEL)
-        log_error( "ERROR: Reported execution capabilities less than required by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+        log_error("ERROR: Reported execution capabilities less than required "
+                  "by OpenCL 1.0 (reported 0x%08x)\n",
+                  (int)value);
         return -1;
     return 0;
-int test_min_max_queue_properties(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_queue_properties(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     int error;
     cl_command_queue_properties value;
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get queue properties" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES,
+                            sizeof(value), &value, 0);
+    test_error(error, "Unable to get queue properties");
-        log_error( "ERROR: Reported queue properties less than required by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+        log_error("ERROR: Reported queue properties less than required by "
+                  "OpenCL 1.0 (reported 0x%08x)\n",
+                  (int)value);
         return -1;
     return 0;
-int test_min_max_device_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_device_version(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     // Query for the device version.
     Version device_cl_version = get_device_cl_version(deviceID);
@@ -1959,84 +2359,101 @@
     return 0;
-int test_min_max_language_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_language_version(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     cl_int error;
-    cl_char buffer[ 4098 ];
+    cl_char buffer[4098];
     size_t length;
     // Device version should fit the regex "OpenCL [0-9]+\.[0-9]+ *.*"
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_OPENCL_C_VERSION, sizeof( buffer ), buffer, &length );
-    test_error( error, "Unable to get device opencl c version string" );
-    if( memcmp( buffer, "OpenCL C ", strlen( "OpenCL C " ) ) != 0 )
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_OPENCL_C_VERSION,
+                            sizeof(buffer), buffer, &length);
+    test_error(error, "Unable to get device opencl c version string");
+    if (memcmp(buffer, "OpenCL C ", strlen("OpenCL C ")) != 0)
-        log_error( "ERROR: Initial part of device language version string does not match required format! (returned: \"%s\")\n", (char *)buffer );
+        log_error("ERROR: Initial part of device language version string does "
+                  "not match required format! (returned: \"%s\")\n",
+                  (char *)buffer);
         return -1;
     log_info("Returned version \"%s\".\n", buffer);
-    char *p1 = (char *)buffer + strlen( "OpenCL C " );
-    while( *p1 == ' ' )
-        p1++;
+    char *p1 = (char *)buffer + strlen("OpenCL C ");
+    while (*p1 == ' ') p1++;
     char *p2 = p1;
-    if( ! isdigit(*p2) )
+    if (!isdigit(*p2))
-        log_error( "ERROR: Major revision number must follow space behind OpenCL C! (returned %s)\n", (char*) buffer );
+        log_error("ERROR: Major revision number must follow space behind "
+                  "OpenCL C! (returned %s)\n",
+                  (char *)buffer);
         return -1;
-    while( isdigit( *p2 ) )
-        p2++;
-    if( *p2 != '.' )
+    while (isdigit(*p2)) p2++;
+    if (*p2 != '.')
-        log_error( "ERROR: Version number must contain a decimal point! (returned: %s)\n", (char *)buffer );
+        log_error("ERROR: Version number must contain a decimal point! "
+                  "(returned: %s)\n",
+                  (char *)buffer);
         return -1;
     char *p3 = p2 + 1;
-    if( ! isdigit(*p3) )
+    if (!isdigit(*p3))
-        log_error( "ERROR: Minor revision number is missing or does not abut the decimal point! (returned %s)\n", (char*) buffer );
+        log_error("ERROR: Minor revision number is missing or does not abut "
+                  "the decimal point! (returned %s)\n",
+                  (char *)buffer);
         return -1;
-    while( isdigit( *p3 ) )
-        p3++;
-    if( *p3 != ' ' )
+    while (isdigit(*p3)) p3++;
+    if (*p3 != ' ')
-        log_error( "ERROR: A space must appear after the minor version! (returned: %s)\n", (char *)buffer );
+        log_error("ERROR: A space must appear after the minor version! "
+                  "(returned: %s)\n",
+                  (char *)buffer);
         return -1;
     *p2 = ' '; // Put in a space for atoi below.
-    int major = atoi( p1 );
-    int minor = atoi( p2 );
+    int major = atoi(p1);
+    int minor = atoi(p2);
     int minor_revision = 2;
-    if( major * 10 + minor < 10 + minor_revision )
+    if (major * 10 + minor < 10 + minor_revision)
-        // If the language version did not match, check to see if OPENCL_1_0_DEVICE is set.
-        if( getenv("OPENCL_1_0_DEVICE"))
+        // If the language version did not match, check to see if
+        // OPENCL_1_0_DEVICE is set.
+        if (getenv("OPENCL_1_0_DEVICE"))
-          log_info( "WARNING: This test was run with OPENCL_1_0_DEVICE defined!  This is not a OpenCL 1.1 or OpenCL 1.2 compatible device!!!\n" );
+            log_info("WARNING: This test was run with OPENCL_1_0_DEVICE "
+                     "defined!  This is not a OpenCL 1.1 or OpenCL 1.2 "
+                     "compatible device!!!\n");
-        else if( getenv("OPENCL_1_1_DEVICE"))
+        else if (getenv("OPENCL_1_1_DEVICE"))
-          log_info( "WARNING: This test was run with OPENCL_1_1_DEVICE defined!  This is not a OpenCL 1.2 compatible device!!!\n" );
+            log_info(
+                "WARNING: This test was run with OPENCL_1_1_DEVICE defined!  "
+                "This is not a OpenCL 1.2 compatible device!!!\n");
-          log_error( "ERROR: OpenCL device language version returned is less than 1.%d! (Returned: %s)\n", minor_revision, (char *)buffer );
-          return -1;
+            log_error("ERROR: OpenCL device language version returned is less "
+                      "than 1.%d! (Returned: %s)\n",
+                      minor_revision, (char *)buffer);
+            return -1;
     // Sanity checks on the returned values
-    if( length != (strlen( (char *)buffer ) + 1 ))
+    if (length != (strlen((char *)buffer) + 1))
-        log_error( "ERROR: Returned length of version string does not match actual length (actual: %d, returned: %d)\n", (int)strlen( (char *)buffer ), (int)length );
+        log_error("ERROR: Returned length of version string does not match "
+                  "actual length (actual: %d, returned: %d)\n",
+                  (int)strlen((char *)buffer), (int)length);
         return -1;
     return 0;
diff --git a/test_conformance/api/test_context_destructor_callback.cpp b/test_conformance/api/test_context_destructor_callback.cpp
index 1d73a3c..d29d903 100644
--- a/test_conformance/api/test_context_destructor_callback.cpp
+++ b/test_conformance/api/test_context_destructor_callback.cpp
@@ -52,12 +52,7 @@
     test_error(error, "Unable to set destructor callback");
     // Now release the context, which SHOULD call the callbacks
-    error = clReleaseContext(localContext);
-    test_error(error, "Unable to release local context");
-    // Note: since we manually released the context, we need to set it to NULL
-    // to prevent a double-release
-    localContext = NULL;
+    localContext.reset();
     // At this point, all three callbacks should have already been called
     int numErrors = 0;
diff --git a/test_conformance/api/test_kernel_arg_info.cpp b/test_conformance/api/test_kernel_arg_info.cpp
index 8073e0d..d0681df 100644
--- a/test_conformance/api/test_kernel_arg_info.cpp
+++ b/test_conformance/api/test_kernel_arg_info.cpp
@@ -22,11 +22,8 @@
 #define MINIMUM_OPENCL_PIPE_VERSION Version(2, 0)
-static constexpr size_t CL_VERSION_LENGTH = 128;
 static constexpr size_t KERNEL_ARGUMENT_LENGTH = 128;
 static constexpr char KERNEL_ARGUMENT_NAME[] = "argument";
-static constexpr size_t KERNEL_ARGUMENT_NAME_LENGTH =
-    sizeof(KERNEL_ARGUMENT_NAME) + 1;
 static constexpr int SINGLE_KERNEL_ARG_NUMBER = 0;
 static constexpr int MAX_NUMBER_OF_KERNEL_ARGS = 128;
@@ -167,7 +164,8 @@
 /* This function generates a kernel source and allows for multiple arguments to
  * be passed in and subsequently queried. */
 static std::string generate_kernel(const std::vector<KernelArgInfo>& all_args,
-                                   const bool supports_3d_image_writes = false)
+                                   const bool supports_3d_image_writes = false,
+                                   const bool kernel_uses_half_type = false)
     std::string ret;
@@ -175,10 +173,13 @@
         ret += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable\n";
+    if (kernel_uses_half_type)
+    {
+        ret += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+    }
     ret += "kernel void get_kernel_arg_info(\n";
     for (int i = 0; i < all_args.size(); ++i)
-        const KernelArgInfo& arg = all_args[i];
         ret += generate_argument(all_args[i]);
         if (i == all_args.size() - 1)
@@ -537,6 +538,7 @@
         cl_int err = clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS,
                                      &device_address_bits, NULL);
+        test_error_ret(err, "clGetDeviceInfo", 0);
         return (device_address_bits / 8);
@@ -673,8 +675,8 @@
                     if (param_size + total_param_size >= max_param_size
                         || all_args.size() == MAX_NUMBER_OF_KERNEL_ARGS)
-                        const std::string kernel_src =
-                            generate_kernel(all_args);
+                        const std::string kernel_src = generate_kernel(
+                            all_args, false, device_supports_half(deviceID));
                         failed_tests += compare_kernel_with_expected(
                             context, deviceID, kernel_src.c_str(),
@@ -696,7 +698,8 @@
-    const std::string kernel_src = generate_kernel(all_args);
+    const std::string kernel_src =
+        generate_kernel(all_args, false, device_supports_half(deviceID));
     failed_tests += compare_kernel_with_expected(
         context, deviceID, kernel_src.c_str(), expected_args);
     return failed_tests;
@@ -808,8 +811,34 @@
     cl_kernel_arg_address_qualifier address_qualifier =
+    Version version = get_device_cl_version(deviceID);
+    bool supports_read_write_images = false;
+    if (version >= Version(3, 0))
+    {
+        cl_uint maxReadWriteImageArgs = 0;
+        cl_int error = clGetDeviceInfo(
+            sizeof(maxReadWriteImageArgs), &maxReadWriteImageArgs, NULL);
+        test_error(error,
+                   "Unable to query "
+                   "CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS");
+        // read-write images are supported if MAX_READ_WRITE_IMAGE_ARGS is
+        // nonzero
+        supports_read_write_images = maxReadWriteImageArgs != 0;
+    }
+    else if (version >= Version(2, 0))
+    {
+        // read-write images are required for OpenCL 2.x
+        supports_read_write_images = true;
+    }
     for (auto access_qualifier : access_qualifiers)
+        if (access_qualifier == CL_KERNEL_ARG_ACCESS_READ_WRITE
+            && !supports_read_write_images)
+            continue;
         bool is_write =
             (access_qualifier == CL_KERNEL_ARG_ACCESS_WRITE_ONLY
              || access_qualifier == CL_KERNEL_ARG_ACCESS_READ_WRITE);
diff --git a/test_conformance/api/test_kernel_attributes.cpp b/test_conformance/api/test_kernel_attributes.cpp
index 2e4e0a7..ad4baa0 100644
--- a/test_conformance/api/test_kernel_attributes.cpp
+++ b/test_conformance/api/test_kernel_attributes.cpp
@@ -275,16 +275,16 @@
         clKernelWrapper kernel;
         cl_int err = create_single_kernel_helper(context, &program, &kernel, 1,
                                                  &kernel_src, "test_kernel");
-        test_error(err, "create_single_kernel_helper");
+        test_error_ret(err, "create_single_kernel_helper", false);
         // Get the size of the kernel attribute string returned
         size_t size = 0;
         err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, 0, nullptr, &size);
-        test_error(err, "clGetKernelInfo");
+        test_error_ret(err, "clGetKernelInfo", false);
         std::vector<char> attributes(size);
         err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, attributes.size(),
                     , nullptr);
-        test_error(err, "clGetKernelInfo");
+        test_error_ret(err, "clGetKernelInfo", false);
         std::string attribute_string(;
             std::remove(attribute_string.begin(), attribute_string.end(), ' '),
diff --git a/test_conformance/api/test_mem_object_info.cpp b/test_conformance/api/test_mem_object_info.cpp
index ccfeaaf..8dc8f6c 100644
--- a/test_conformance/api/test_mem_object_info.cpp
+++ b/test_conformance/api/test_mem_object_info.cpp
@@ -348,14 +348,7 @@
             TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_ASSOCIATED_MEMOBJECT, origObj, (cl_mem)bufferObject, "associated mem object", "%p", void * )
             TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_OFFSET, offset, (size_t)( addressAlign ), "offset", "%ld", size_t )
-            clReleaseMemObject( subBufferObject );
-            subBufferObject = NULL;
-        clReleaseMemObject( bufferObject );
-        bufferObject = NULL;
     return CL_SUCCESS;
@@ -370,8 +363,6 @@
     cl_mem_flags flags;
     cl_uint mapCount;
     cl_uint refCount;
-    size_t rowPitchMultiplier;
-    size_t slicePitchMultiplier;
     cl_context otherCtx;
     size_t offset;
     size_t sz;
diff --git a/test_conformance/api/test_mem_objects.cpp b/test_conformance/api/test_mem_objects.cpp
index c29613f..f1a4e99 100644
--- a/test_conformance/api/test_mem_objects.cpp
+++ b/test_conformance/api/test_mem_objects.cpp
@@ -48,12 +48,7 @@
     test_error(error, "Unable to set destructor callback");
     // Now release the buffer, which SHOULD call the callbacks
-    error = clReleaseMemObject(memObject);
-    test_error(error, "Unable to release test buffer");
-    // Note: since we manually released the mem wrapper, we need to set it to
-    // NULL to prevent a double-release
-    memObject = NULL;
+    memObject.reset();
     // At this point, all three callbacks should have already been called
     int numErrors = 0;
diff --git a/test_conformance/api/test_null_buffer_arg.cpp b/test_conformance/api/test_null_buffer_arg.cpp
index d412d4e..75bdd47 100644
--- a/test_conformance/api/test_null_buffer_arg.cpp
+++ b/test_conformance/api/test_null_buffer_arg.cpp
@@ -149,7 +149,6 @@
     cl_command_queue queue, int num_elements)
     unsigned int test_success = 0;
-    unsigned int i;
     unsigned int buffer_size;
     cl_int status;
     cl_program program;
diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp
index 469a193..a7703a7 100644
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 #include <ctype.h>
 #include <algorithm>
+#include <vector>
 int test_get_platform_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
@@ -345,87 +346,100 @@
     return 0;
-static cl_command_queue_properties property_options[] = {
-    0,
 int check_get_command_queue_info_params(cl_device_id deviceID,
                                         cl_context context,
                                         bool is_compatibility)
-    int error;
-    size_t size;
+    const cl_command_queue_properties host_optional[] = {
+    };
-    cl_queue_properties host_queue_props, device_queue_props;
-    cl_queue_properties queue_props[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+    const cl_command_queue_properties device_required[] = {
+    };
-                    sizeof(host_queue_props), &host_queue_props, NULL);
-    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n",
-             (int)host_queue_props);
-                    sizeof(device_queue_props), &device_queue_props, NULL);
-    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n",
-             (int)device_queue_props);
+    const size_t host_optional_size = ARRAY_SIZE(host_optional);
+    const size_t device_required_size = ARRAY_SIZE(device_required);
-    auto version = get_device_cl_version(deviceID);
+    Version version = get_device_cl_version(deviceID);
-    // Are on device queues supported
+    const cl_device_info host_queue_query = version >= Version(2, 0)
+    cl_queue_properties host_queue_props = 0;
+    int error =
+        clGetDeviceInfo(deviceID, host_queue_query, sizeof(host_queue_props),
+                        &host_queue_props, NULL);
+    test_error(error, "clGetDeviceInfo failed");
+    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", host_queue_props);
+    cl_queue_properties device_queue_props = 0;
+    if (version >= Version(2, 0))
+    {
+        error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES,
+                                sizeof(device_queue_props), &device_queue_props,
+                                NULL);
+        test_error(error, "clGetDeviceInfo failed");
+        log_info("CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES is %d\n",
+                 device_queue_props);
+    }
+    bool out_of_order_supported =
+        host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
     bool on_device_supported =
         (version >= Version(2, 0) && version < Version(3, 0))
         || (version >= Version(3, 0) && device_queue_props != 0);
-    int num_test_options = MIN_NUM_COMMAND_QUEUE_PROPERTIES;
-    if (host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
-    {
-        // Test out-of-order queues properties if supported
-        num_test_options = OOO_NUM_COMMAND_QUEUE_PROPERTIES;
-    }
-    if (on_device_supported && !is_compatibility)
-    {
-        // Test queue on device if supported (in this case out-of-order must
-        // also be supported)
-        num_test_options = ARRAY_SIZE(property_options);
-    }
+    // test device queues if the device and the API under test support it
+    bool test_on_device = on_device_supported && !is_compatibility;
-    for (int i = 0; i < num_test_options; i++)
+    std::vector<cl_queue_properties> queue_props{ 0,
+                                                  CL_QUEUE_PROFILING_ENABLE };
+    if (out_of_order_supported)
-        queue_props[1] = property_options[i];
+        queue_props.insert(queue_props.end(), &host_optional[0],
+                           &host_optional[host_optional_size]);
+    };
+    cl_queue_properties queue_props_arg[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+    if (test_on_device)
+    {
+        queue_props.insert(queue_props.end(), &device_required[0],
+                           &device_required[device_required_size]);
+    };
+    for (cl_queue_properties props : queue_props)
+    {
+        queue_props_arg[1] = props;
         clCommandQueueWrapper queue;
         if (is_compatibility)
-            queue =
-                clCreateCommandQueue(context, deviceID, queue_props[1], &error);
+            queue = clCreateCommandQueue(context, deviceID, props, &error);
             test_error(error, "Unable to create command queue to test with");
             queue = clCreateCommandQueueWithProperties(context, deviceID,
-                                                       &queue_props[0], &error);
+                                                       queue_props_arg, &error);
             test_error(error, "Unable to create command queue to test with");
         cl_uint refCount;
+        size_t size;
         error = clGetCommandQueueInfo(queue, CL_QUEUE_REFERENCE_COUNT,
                                       sizeof(refCount), &refCount, &size);
         test_error(error, "Unable to get command queue reference count");
@@ -442,11 +456,12 @@
         test_error(error, "param checking failed");
         error = command_queue_param_test(queue, CL_QUEUE_PROPERTIES,
-                                         queue_props[1], "properties");
+                                         queue_props_arg[1], "properties");
         test_error(error, "param checking failed");
     return 0;
 int test_get_command_queue_info(cl_device_id deviceID, cl_context context,
                                 cl_command_queue ignoreQueue, int num_elements)
@@ -511,26 +526,6 @@
     free( data );
-// All possible combinations of valid cl_mem_flags.
-static cl_mem_flags all_flags[16] = {
-  0,
 #define TEST_DEVICE_PARAM( device, paramName, val, name, type, cast )    \
 error = clGetDeviceInfo( device, paramName, sizeof( val ), &val, &size );        \
 test_error( error, "Unable to get device " name );                            \
@@ -824,5 +819,3 @@
     return 0;
diff --git a/test_conformance/api/test_sub_group_dispatch.cpp b/test_conformance/api/test_sub_group_dispatch.cpp
index 01d0ffa..61d9a52 100644
--- a/test_conformance/api/test_sub_group_dispatch.cpp
+++ b/test_conformance/api/test_sub_group_dispatch.cpp
@@ -56,11 +56,9 @@
 int test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-    static const size_t gsize0 = 80;
-    int i, error;
+    int error;
     size_t realSize;
     size_t kernel_max_subgroup_size, kernel_subgroup_count;
-    size_t global[] = {1,1,1};
     size_t max_local;
     cl_platform_id platform;
diff --git a/test_conformance/atomics/main.cpp b/test_conformance/atomics/main.cpp
index afdea37..987d6bf 100644
--- a/test_conformance/atomics/main.cpp
+++ b/test_conformance/atomics/main.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -24,6 +24,7 @@
 #include <unistd.h>
+// clang-format off
 test_definition test_list[] = {
     ADD_TEST( atomic_add ),
     ADD_TEST( atomic_sub ),
@@ -40,11 +41,11 @@
     ADD_TEST( atomic_add_index ),
     ADD_TEST( atomic_add_index_bin ),
+// clang-format on
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 int main(int argc, const char *argv[])
     return runTestHarness(argc, argv, test_num, test_list, false, 0);
diff --git a/test_conformance/atomics/procs.h b/test_conformance/atomics/procs.h
index bf053f2..fa85aad 100644
--- a/test_conformance/atomics/procs.h
+++ b/test_conformance/atomics/procs.h
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,22 +18,35 @@
 #include "harness/threadTesting.h"
 #include "harness/typeWrappers.h"
-extern int      create_program_and_kernel(const char *source, const char *kernel_name, cl_program *program_ret, cl_kernel *kernel_ret);
+extern int create_program_and_kernel(const char *source,
+                                     const char *kernel_name,
+                                     cl_program *program_ret,
+                                     cl_kernel *kernel_ret);
-extern int        test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_atomic_add(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_sub(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_xchg(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements);
+extern int test_atomic_min(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_max(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_inc(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_dec(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_atomic_and(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_atomic_or(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements);
+extern int test_atomic_xor(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
-extern int        test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_atomic_add_index(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements);
+extern int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements);
diff --git a/test_conformance/atomics/testBase.h b/test_conformance/atomics/testBase.h
index ba67d14..22bce1d 100644
--- a/test_conformance/atomics/testBase.h
+++ b/test_conformance/atomics/testBase.h
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -26,6 +26,3 @@
 #include "procs.h"
 #endif // _testBase_h
diff --git a/test_conformance/atomics/test_atomics.cpp b/test_conformance/atomics/test_atomics.cpp
index 34b34ed..caa4b78 100644
--- a/test_conformance/atomics/test_atomics.cpp
+++ b/test_conformance/atomics/test_atomics.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -19,10 +19,12 @@
 #include <unistd.h>
+#include <cinttypes>
 #define INT_TEST_VALUE 402258822
 #define LONG_TEST_VALUE 515154531254381446LL
+// clang-format off
 const char *atomic_global_pattern[] = {
     "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
@@ -36,19 +38,20 @@
     "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
     "    int  tid = get_global_id(0);\n"
-    "     int  dstItemIdx;\n"
+    "    int  dstItemIdx;\n"
     "    // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
-    "     for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
+    "    for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
     "        destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
     "    // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
-    "     for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
+    "    for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
     "        finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
     "}\n" };
+// clang-format on
 #define TEST_COUNT 128 * 1024
@@ -56,41 +59,48 @@
 struct TestFns
-    cl_int    mIntStartValue;
-    cl_long    mLongStartValue;
+    cl_int mIntStartValue;
+    cl_long mLongStartValue;
-    size_t    (*NumResultsFn)( size_t threadSize, ExplicitType dataType );
+    size_t (*NumResultsFn)(size_t threadSize, ExplicitType dataType);
     // Integer versions
-    cl_int    (*ExpectedValueIntFn)( size_t size, cl_int *startRefValues, size_t whichDestValue );
-    void    (*GenerateRefsIntFn)( size_t size, cl_int *startRefValues, MTdata d );
-    bool    (*VerifyRefsIntFn)( size_t size, cl_int *refValues, cl_int finalValue );
+    cl_int (*ExpectedValueIntFn)(size_t size, cl_int *startRefValues,
+                                 size_t whichDestValue);
+    void (*GenerateRefsIntFn)(size_t size, cl_int *startRefValues, MTdata d);
+    bool (*VerifyRefsIntFn)(size_t size, cl_int *refValues, cl_int finalValue);
     // Long versions
-    cl_long    (*ExpectedValueLongFn)( size_t size, cl_long *startRefValues, size_t whichDestValue );
-    void    (*GenerateRefsLongFn)( size_t size, cl_long *startRefValues, MTdata d );
-    bool    (*VerifyRefsLongFn)( size_t size, cl_long *refValues, cl_long finalValue );
+    cl_long (*ExpectedValueLongFn)(size_t size, cl_long *startRefValues,
+                                   size_t whichDestValue);
+    void (*GenerateRefsLongFn)(size_t size, cl_long *startRefValues, MTdata d);
+    bool (*VerifyRefsLongFn)(size_t size, cl_long *refValues,
+                             cl_long finalValue);
     // Float versions
-    cl_float    (*ExpectedValueFloatFn)( size_t size, cl_float *startRefValues, size_t whichDestValue );
-    void        (*GenerateRefsFloatFn)( size_t size, cl_float *startRefValues, MTdata d );
-    bool        (*VerifyRefsFloatFn)( size_t size, cl_float *refValues, cl_float finalValue );
+    cl_float (*ExpectedValueFloatFn)(size_t size, cl_float *startRefValues,
+                                     size_t whichDestValue);
+    void (*GenerateRefsFloatFn)(size_t size, cl_float *startRefValues,
+                                MTdata d);
+    bool (*VerifyRefsFloatFn)(size_t size, cl_float *refValues,
+                              cl_float finalValue);
-bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, ExplicitType dataType )
+bool check_atomic_support(cl_device_id device, bool extended, bool isLocal,
+                          ExplicitType dataType)
+    // clang-format off
     const char *extensionNames[8] = {
         "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
         "cl_khr_local_int32_base_atomics",  "cl_khr_local_int32_extended_atomics",
         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics",
         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics"       // this line intended to be the same as the last one
+    // clang-format on
     size_t index = 0;
-    if( extended )
-        index += 1;
-    if( isLocal )
-        index += 2;
+    if (extended) index += 1;
+    if (isLocal) index += 2;
     Version version = get_device_cl_version(device);
@@ -98,26 +108,28 @@
         case kInt:
         case kUInt:
-            if( version >= Version(1,1) )
-                return 1;
+            if (version >= Version(1, 1)) return 1;
         case kLong:
-        case kULong:
-            index += 4;
-            break;
-        case kFloat:  // this has to stay separate since the float atomics arent in the 1.0 extensions
-            return version >= Version(1,1);
+        case kULong: index += 4; break;
+        case kFloat: // this has to stay separate since the float atomics aren't
+                     // in the 1.0 extensions
+            return version >= Version(1, 1);
-            log_error( "ERROR:  Unsupported data type (%d) in check_atomic_support\n", dataType );
+            log_error(
+                "ERROR:  Unsupported data type (%d) in check_atomic_support\n",
+                dataType);
             return 0;
-    return is_extension_available( device, extensionNames[index] );
+    return is_extension_available(device, extensionNames[index]);
-int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
-                         TestFns testFns,
-                         bool extended, bool isLocal, ExplicitType dataType, bool matchGroupSize )
+int test_atomic_function(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements,
+                         const char *programCore, TestFns testFns,
+                         bool extended, bool isLocal, ExplicitType dataType,
+                         bool matchGroupSize)
     clProgramWrapper program;
     clKernelWrapper kernel;
@@ -127,55 +139,65 @@
     void *refValues, *startRefValues;
     size_t threadSize, groupSize;
     const char *programLines[4];
-    char pragma[ 512 ];
-    char programHeader[ 512 ];
+    char pragma[512];
+    char programHeader[512];
     MTdata d;
-    size_t typeSize = get_explicit_type_size( dataType );
+    size_t typeSize = get_explicit_type_size(dataType);
     // Verify we can run first
-    bool isUnsigned = ( dataType == kULong ) || ( dataType == kUInt );
-    if( !check_atomic_support( deviceID, extended, isLocal, dataType ) )
+    bool isUnsigned = (dataType == kULong) || (dataType == kUInt);
+    if (!check_atomic_support(deviceID, extended, isLocal, dataType))
-        // Only print for the signed (unsigned comes right after, and if signed isn't supported, unsigned isn't either)
-        if( dataType == kFloat )
-            log_info( "\t%s float not supported\n", isLocal ? "Local" : "Global" );
-        else if( !isUnsigned )
-            log_info( "\t%s %sint%d not supported\n", isLocal ? "Local" : "Global", isUnsigned ? "u" : "", (int)typeSize * 8 );
+        // Only print for the signed (unsigned comes right after, and if signed
+        // isn't supported, unsigned isn't either)
+        if (dataType == kFloat)
+            log_info("\t%s float not supported\n",
+                     isLocal ? "Local" : "Global");
+        else if (!isUnsigned)
+            log_info("\t%s %sint%d not supported\n",
+                     isLocal ? "Local" : "Global", isUnsigned ? "u" : "",
+                     (int)typeSize * 8);
         // Since we don't support the operation, they implicitly pass
         return 0;
-        if( dataType == kFloat )
-            log_info( "\t%s float%s...", isLocal ? "local" : "global", isLocal ? " " : "" );
+        if (dataType == kFloat)
+            log_info("\t%s float%s...", isLocal ? "local" : "global",
+                     isLocal ? " " : "");
-            log_info( "\t%s %sint%d%s%s...", isLocal ? "local" : "global", isUnsigned ? "u" : "",
-                     (int)typeSize * 8, isUnsigned ? "" : " ", isLocal ? " " : "" );
+            log_info("\t%s %sint%d%s%s...", isLocal ? "local" : "global",
+                     isUnsigned ? "u" : "", (int)typeSize * 8,
+                     isUnsigned ? "" : " ", isLocal ? " " : "");
     //// Set up the kernel code
     // Create the pragma line for this kernel
-    bool isLong = ( dataType == kLong || dataType == kULong );
-    sprintf( pragma, "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
-            isLong ? "" : (isLocal ? "_local" : "_global"), isLong ? "64" : "32",
-            extended ? "extended" : "base" );
+    bool isLong = (dataType == kLong || dataType == kULong);
+    sprintf(pragma,
+            "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
+            isLong ? "" : (isLocal ? "_local" : "_global"),
+            isLong ? "64" : "32", extended ? "extended" : "base");
     // Now create the program header
-    const char *typeName = get_explicit_type_name( dataType );
-    if( isLocal )
-        sprintf( programHeader, atomic_local_pattern[ 0 ], typeName, typeName, typeName );
+    const char *typeName = get_explicit_type_name(dataType);
+    if (isLocal)
+        sprintf(programHeader, atomic_local_pattern[0], typeName, typeName,
+                typeName);
-        sprintf( programHeader, atomic_global_pattern[ 0 ], typeName, typeName );
+        sprintf(programHeader, atomic_global_pattern[0], typeName, typeName);
     // Set up our entire program now
-    programLines[ 0 ] = pragma;
-    programLines[ 1 ] = programHeader;
-    programLines[ 2 ] = programCore;
-    programLines[ 3 ] = ( isLocal ) ? atomic_local_pattern[ 1 ] : atomic_global_pattern[ 1 ];
+    programLines[0] = pragma;
+    programLines[1] = programHeader;
+    programLines[2] = programCore;
+    programLines[3] =
+        (isLocal) ? atomic_local_pattern[1] : atomic_global_pattern[1];
-    if( create_single_kernel_helper( context, &program, &kernel, 4, programLines, "test_atomic_fn" ) )
+    if (create_single_kernel_helper(context, &program, &kernel, 4, programLines,
+                                    "test_atomic_fn"))
         return -1;
@@ -183,25 +205,37 @@
     //// Set up to actually run
     threadSize = num_elements;
-    error = get_max_common_work_group_size( context, kernel, threadSize, &groupSize );
-    test_error( error, "Unable to get thread group max size" );
+    error =
+        get_max_common_work_group_size(context, kernel, threadSize, &groupSize);
+    test_error(error, "Unable to get thread group max size");
-    if( matchGroupSize )
+    if (matchGroupSize)
         // HACK because xchg and cmpxchg apparently are limited by hardware
         threadSize = groupSize;
-    if( isLocal )
+    if (isLocal)
-        size_t maxSizes[3] = {0, 0, 0};
-        error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), maxSizes, 0);
-        test_error( error, "Unable to obtain max work item sizes for the device" );
+        size_t maxSizes[3] = { 0, 0, 0 };
+        error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                                3 * sizeof(size_t), maxSizes, 0);
+        test_error(error,
+                   "Unable to obtain max work item sizes for the device");
         size_t workSize;
-        error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL );
-        test_error( error, "Unable to obtain max work group size for device and kernel combo" );
+        error = clGetKernelWorkGroupInfo(kernel, deviceID,
+                                         CL_KERNEL_WORK_GROUP_SIZE,
+                                         sizeof(workSize), &workSize, NULL);
+        test_error(
+            error,
+            "Unable to obtain max work group size for device and kernel combo");
-        // "workSize" is limited to that of the first dimension as only a 1DRange is executed.
-        if( maxSizes[0] < workSize )
+        // Limit workSize to avoid extremely large local buffer size and slow
+        // run.
+        if (workSize > 65536) workSize = 65536;
+        // "workSize" is limited to that of the first dimension as only a
+        // 1DRange is executed.
+        if (maxSizes[0] < workSize)
             workSize = maxSizes[0];
@@ -210,38 +244,43 @@
-    log_info( "\t(thread count %d, group size %d)\n", (int)threadSize, (int)groupSize );
+    log_info("\t(thread count %d, group size %d)\n", (int)threadSize,
+             (int)groupSize);
-    refValues = (cl_int *)malloc( typeSize * threadSize );
+    refValues = (cl_int *)malloc(typeSize * threadSize);
-    if( testFns.GenerateRefsIntFn != NULL )
+    if (testFns.GenerateRefsIntFn != NULL)
         // We have a ref generator provided
-        d = init_genrand( gRandomSeed );
-        startRefValues = malloc( typeSize * threadSize );
-        if( typeSize == 4 )
-            testFns.GenerateRefsIntFn( threadSize, (cl_int *)startRefValues, d );
+        d = init_genrand(gRandomSeed);
+        startRefValues = malloc(typeSize * threadSize);
+        if (typeSize == 4)
+            testFns.GenerateRefsIntFn(threadSize, (cl_int *)startRefValues, d);
-            testFns.GenerateRefsLongFn( threadSize, (cl_long *)startRefValues, d );
+            testFns.GenerateRefsLongFn(threadSize, (cl_long *)startRefValues,
+                                       d);
         d = NULL;
         startRefValues = NULL;
-    // If we're given a num_results function, we need to determine how many result objects we need. If
-    // we don't have it, we assume it's just 1
-    size_t numDestItems = ( testFns.NumResultsFn != NULL ) ? testFns.NumResultsFn( threadSize, dataType ) : 1;
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. If we don't have it, we assume it's just 1
+    size_t numDestItems = (testFns.NumResultsFn != NULL)
+        ? testFns.NumResultsFn(threadSize, dataType)
+        : 1;
-    char * destItems = new char[ typeSize * numDestItems ];
-    if( destItems == NULL )
+    char *destItems = new char[typeSize * numDestItems];
+    if (destItems == NULL)
-        log_error( "ERROR: Unable to allocate memory!\n" );
+        log_error("ERROR: Unable to allocate memory!\n");
         return -1;
-    void * startValue = ( typeSize == 4 ) ? (void *)&testFns.mIntStartValue : (void *)&testFns.mLongStartValue;
-    for( size_t i = 0; i < numDestItems; i++ )
-        memcpy( destItems + i * typeSize, startValue, typeSize );
+    void *startValue = (typeSize == 4) ? (void *)&testFns.mIntStartValue
+                                       : (void *)&testFns.mLongStartValue;
+    for (size_t i = 0; i < numDestItems; i++)
+        memcpy(destItems + i * typeSize, startValue, typeSize);
     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                 typeSize * numDestItems, destItems, NULL);
@@ -261,82 +300,97 @@
     /* Set the arguments */
-    error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
-    test_error( error, "Unable to set indexed kernel arguments" );
-    error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] );
-    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set indexed kernel arguments");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set indexed kernel arguments");
-    if( isLocal )
+    if (isLocal)
-        error = clSetKernelArg( kernel, 2, typeSize * numDestItems, NULL );
-        test_error( error, "Unable to set indexed local kernel argument" );
+        error = clSetKernelArg(kernel, 2, typeSize * numDestItems, NULL);
+        test_error(error, "Unable to set indexed local kernel argument");
         cl_int numDestItemsInt = (cl_int)numDestItems;
-        error = clSetKernelArg( kernel, 3, sizeof( cl_int ), &numDestItemsInt );
-        test_error( error, "Unable to set indexed kernel argument" );
+        error = clSetKernelArg(kernel, 3, sizeof(cl_int), &numDestItemsInt);
+        test_error(error, "Unable to set indexed kernel argument");
     /* Run the kernel */
     threads[0] = threadSize;
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &groupSize, 0, NULL, NULL );
-    test_error( error, "Unable to execute test kernel" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, &groupSize,
+                                   0, NULL, NULL);
+    test_error(error, "Unable to execute test kernel");
-    error = clEnqueueReadBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
-    test_error( error, "Unable to read result value!" );
+    error =
+        clEnqueueReadBuffer(queue, streams[0], true, 0, typeSize * numDestItems,
+                            destItems, 0, NULL, NULL);
+    test_error(error, "Unable to read result value!");
-    error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize * threadSize, refValues, 0, NULL, NULL );
-    test_error( error, "Unable to read reference values!" );
+    error =
+        clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize * threadSize,
+                            refValues, 0, NULL, NULL);
+    test_error(error, "Unable to read reference values!");
-    // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
-    // have one, it's because we're comparing ref values only
-    if( testFns.ExpectedValueIntFn != NULL )
+    // If we have an expectedFn, then we need to generate a final value to
+    // compare against. If we don't have one, it's because we're comparing ref
+    // values only
+    if (testFns.ExpectedValueIntFn != NULL)
-        for( size_t i = 0; i < numDestItems; i++ )
+        for (size_t i = 0; i < numDestItems; i++)
-            char expected[ 8 ];
+            char expected[8];
             cl_int intVal;
             cl_long longVal;
-            if( typeSize == 4 )
+            if (typeSize == 4)
                 // Int version
-                intVal = testFns.ExpectedValueIntFn( threadSize, (cl_int *)startRefValues, i );
-                memcpy( expected, &intVal, sizeof( intVal ) );
+                intVal = testFns.ExpectedValueIntFn(
+                    threadSize, (cl_int *)startRefValues, i);
+                memcpy(expected, &intVal, sizeof(intVal));
                 // Long version
-                longVal = testFns.ExpectedValueLongFn( threadSize, (cl_long *)startRefValues, i );
-                memcpy( expected, &longVal, sizeof( longVal ) );
+                longVal = testFns.ExpectedValueLongFn(
+                    threadSize, (cl_long *)startRefValues, i);
+                memcpy(expected, &longVal, sizeof(longVal));
-            if( memcmp( expected, destItems + i * typeSize, typeSize ) != 0 )
+            if (memcmp(expected, destItems + i * typeSize, typeSize) != 0)
-                if( typeSize == 4 )
+                if (typeSize == 4)
-                    cl_int *outValue = (cl_int *)( destItems + i * typeSize );
-                    log_error( "ERROR: Result %ld from kernel does not validate! (should be %d, was %d)\n", i, intVal, *outValue );
+                    cl_int *outValue = (cl_int *)(destItems + i * typeSize);
+                    log_error("ERROR: Result %zu from kernel does not "
+                              "validate! (should be %d, was %d)\n",
+                              i, intVal, *outValue);
                     cl_int *startRefs = (cl_int *)startRefValues;
                     cl_int *refs = (cl_int *)refValues;
-                    for( i = 0; i < threadSize; i++ )
+                    for (i = 0; i < threadSize; i++)
-                        if( startRefs != NULL )
-                            log_info( " --- %ld - %d --- %d\n", i, startRefs[i], refs[i] );
+                        if (startRefs != NULL)
+                            log_info(" --- %zu - %d --- %d\n", i, startRefs[i],
+                                     refs[i]);
-                            log_info( " --- %ld --- %d\n", i, refs[i] );
+                            log_info(" --- %zu --- %d\n", i, refs[i]);
-                    cl_long *outValue = (cl_long *)( destItems + i * typeSize );
-                    log_error( "ERROR: Result %ld from kernel does not validate! (should be %lld, was %lld)\n", i, longVal, *outValue );
+                    cl_long *outValue = (cl_long *)(destItems + i * typeSize);
+                    log_error("ERROR: Result %zu from kernel does not "
+                              "validate! (should be %" PRId64 ", was %" PRId64
+                              ")\n",
+                              i, longVal, *outValue);
                     cl_long *startRefs = (cl_long *)startRefValues;
                     cl_long *refs = (cl_long *)refValues;
-                    for( i = 0; i < threadSize; i++ )
+                    for (i = 0; i < threadSize; i++)
-                        if( startRefs != NULL )
-                            log_info( " --- %ld - %lld --- %lld\n", i, startRefs[i], refs[i] );
+                        if (startRefs != NULL)
+                            log_info(" --- %zu - %" PRId64 " --- %" PRId64 "\n",
+                                     i, startRefs[i], refs[i]);
-                            log_info( " --- %ld --- %lld\n", i, refs[i] );
+                            log_info(" --- %zu --- %" PRId64 "\n", i, refs[i]);
                 return -1;
@@ -344,104 +398,141 @@
-    if( testFns.VerifyRefsIntFn != NULL )
+    if (testFns.VerifyRefsIntFn != NULL)
         /* Use the verify function to also check the results */
-        if( dataType == kFloat )
+        if (dataType == kFloat)
             cl_float *outValue = (cl_float *)destItems;
-            if( !testFns.VerifyRefsFloatFn( threadSize, (cl_float *)refValues, *outValue ) != 0 )
+            if (!testFns.VerifyRefsFloatFn(threadSize, (cl_float *)refValues,
+                                           *outValue)
+                != 0)
-                log_error( "ERROR: Reference values did not validate!\n" );
+                log_error("ERROR: Reference values did not validate!\n");
                 return -1;
-        else if( typeSize == 4 )
+        else if (typeSize == 4)
             cl_int *outValue = (cl_int *)destItems;
-            if( !testFns.VerifyRefsIntFn( threadSize, (cl_int *)refValues, *outValue ) != 0 )
+            if (!testFns.VerifyRefsIntFn(threadSize, (cl_int *)refValues,
+                                         *outValue)
+                != 0)
-                log_error( "ERROR: Reference values did not validate!\n" );
+                log_error("ERROR: Reference values did not validate!\n");
                 return -1;
             cl_long *outValue = (cl_long *)destItems;
-            if( !testFns.VerifyRefsLongFn( threadSize, (cl_long *)refValues, *outValue ) != 0 )
+            if (!testFns.VerifyRefsLongFn(threadSize, (cl_long *)refValues,
+                                          *outValue)
+                != 0)
-                log_error( "ERROR: Reference values did not validate!\n" );
+                log_error("ERROR: Reference values did not validate!\n");
                 return -1;
-    else if( testFns.ExpectedValueIntFn == NULL )
+    else if (testFns.ExpectedValueIntFn == NULL)
-        log_error( "ERROR: Test doesn't check total or refs; no values are verified!\n" );
+        log_error("ERROR: Test doesn't check total or refs; no values are "
+                  "verified!\n");
         return -1;
     /* Re-write the starting value */
-    for( size_t i = 0; i < numDestItems; i++ )
-        memcpy( destItems + i * typeSize, startValue, typeSize );
-    error = clEnqueueWriteBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
-    test_error( error, "Unable to write starting values!" );
+    for (size_t i = 0; i < numDestItems; i++)
+        memcpy(destItems + i * typeSize, startValue, typeSize);
+    error =
+        clEnqueueWriteBuffer(queue, streams[0], true, 0,
+                             typeSize * numDestItems, destItems, 0, NULL, NULL);
+    test_error(error, "Unable to write starting values!");
-    /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
+    /* Run the kernel once for a single thread, so we can verify that the
+     * returned value is the original one */
     threads[0] = 1;
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, threads, 0, NULL, NULL );
-    test_error( error, "Unable to execute test kernel" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, threads, 0,
+                                   NULL, NULL);
+    test_error(error, "Unable to execute test kernel");
-    error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize, refValues, 0, NULL, NULL );
-    test_error( error, "Unable to read reference values!" );
+    error = clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize, refValues,
+                                0, NULL, NULL);
+    test_error(error, "Unable to read reference values!");
-    if( memcmp( refValues, destItems, typeSize ) != 0 )
+    if (memcmp(refValues, destItems, typeSize) != 0)
-        if( typeSize == 4 )
+        if (typeSize == 4)
             cl_int *s = (cl_int *)destItems;
             cl_int *r = (cl_int *)refValues;
-            log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
-                      " (should have been %d, returned %d)!\n", *s, *r );
+            log_error("ERROR: atomic function operated correctly but did NOT "
+                      "return correct 'old' value "
+                      " (should have been %d, returned %d)!\n",
+                      *s, *r);
             cl_long *s = (cl_long *)destItems;
             cl_long *r = (cl_long *)refValues;
-            log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
-                      " (should have been %lld, returned %lld)!\n", *s, *r );
+            log_error("ERROR: atomic function operated correctly but did NOT "
+                      "return correct 'old' value "
+                      " (should have been %" PRId64 ", returned %" PRId64
+                      ")!\n",
+                      *s, *r);
         return -1;
-    delete [] destItems;
-    free( refValues );
-    if( startRefValues != NULL )
-        free( startRefValues );
+    delete[] destItems;
+    free(refValues);
+    if (startRefValues != NULL) free(startRefValues);
     return 0;
-int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
-                             TestFns testFns,
-                             bool extended, bool matchGroupSize, bool usingAtomicPrefix )
+int test_atomic_function_set(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements,
+                             const char *programCore, TestFns testFns,
+                             bool extended, bool matchGroupSize,
+                             bool usingAtomicPrefix)
-    log_info("    Testing %s functions...\n", usingAtomicPrefix ? "atomic_" : "atom_");
+    log_info("    Testing %s functions...\n",
+             usingAtomicPrefix ? "atomic_" : "atom_");
     int errors = 0;
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kInt, matchGroupSize );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kUInt, matchGroupSize );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kInt, matchGroupSize );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kUInt, matchGroupSize );
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   programCore, testFns, extended, false, kInt,
+                                   matchGroupSize);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   programCore, testFns, extended, false, kUInt,
+                                   matchGroupSize);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   programCore, testFns, extended, true, kInt,
+                                   matchGroupSize);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   programCore, testFns, extended, true, kUInt,
+                                   matchGroupSize);
-    // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 bit functions still use the "atom" prefix.
-    // The argument usingAtomicPrefix is set to true if programCore was generated with the "atomic" prefix.
-    if (!usingAtomicPrefix) {
-      errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kLong, matchGroupSize );
-      errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kULong, matchGroupSize );
-      errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kLong, matchGroupSize );
-      errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kULong, matchGroupSize );
+    // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64
+    // bit functions still use the "atom" prefix. The argument usingAtomicPrefix
+    // is set to true if programCore was generated with the "atomic" prefix.
+    if (!usingAtomicPrefix)
+    {
+        errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                       programCore, testFns, extended, false,
+                                       kLong, matchGroupSize);
+        errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                       programCore, testFns, extended, false,
+                                       kULong, matchGroupSize);
+        errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                       programCore, testFns, extended, true,
+                                       kLong, matchGroupSize);
+        errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                       programCore, testFns, extended, true,
+                                       kULong, matchGroupSize);
     return errors;
@@ -450,265 +541,346 @@
 #pragma mark ---- add
 const char atom_add_core[] =
-"    oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
-"    atom_add( &destMemory[0], tid + 3 );\n"
-"   atom_add( &destMemory[0], tid + 3 );\n"
-"   atom_add( &destMemory[0], tid + 3 );\n";
+    "    oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
+    "    atom_add( &destMemory[0], tid + 3 );\n"
+    "    atom_add( &destMemory[0], tid + 3 );\n"
+    "    atom_add( &destMemory[0], tid + 3 );\n";
 const char atomic_add_core[] =
-"    oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
-"    atomic_add( &destMemory[0], tid + 3 );\n"
-"   atomic_add( &destMemory[0], tid + 3 );\n"
-"   atomic_add( &destMemory[0], tid + 3 );\n";
+    "    oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
+    "    atomic_add( &destMemory[0], tid + 3 );\n"
+    "    atomic_add( &destMemory[0], tid + 3 );\n"
+    "    atomic_add( &destMemory[0], tid + 3 );\n";
-cl_int test_atomic_add_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_add_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichDestValue)
     cl_int total = 0;
-    for( size_t i = 0; i < size; i++ )
-        total += ( (cl_int)i + 3 ) * 4;
+    for (size_t i = 0; i < size; i++) total += ((cl_int)i + 3) * 4;
     return total;
-cl_long test_atomic_add_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_add_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichDestValue)
     cl_long total = 0;
-    for( size_t i = 0; i < size; i++ )
-        total += ( ( i + 3 ) * 4 );
+    for (size_t i = 0; i < size; i++) total += ((i + 3) * 4);
     return total;
-int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_add(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { 0, 0LL, NULL, test_atomic_add_result_int, NULL, NULL, test_atomic_add_result_long, NULL, NULL };
+    TestFns set = { 0,
+                    0LL,
+                    NULL,
+                    test_atomic_add_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_add_result_long,
+                    NULL,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_add_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
-      return -1;
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_add_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
+        return -1;
     return 0;
 #pragma mark ---- sub
-const char atom_sub_core[] = "    oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";
+const char atom_sub_core[] =
+    "    oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";
-const char atomic_sub_core[] = "    oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
+const char atomic_sub_core[] =
+    "    oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
-cl_int test_atomic_sub_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_sub_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichDestValue)
     cl_int total = INT_TEST_VALUE;
-    for( size_t i = 0; i < size; i++ )
-        total -= (cl_int)i + 3;
+    for (size_t i = 0; i < size; i++) total -= (cl_int)i + 3;
     return total;
-cl_long test_atomic_sub_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_sub_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichDestValue)
     cl_long total = LONG_TEST_VALUE;
-    for( size_t i = 0; i < size; i++ )
-        total -= i + 3;
+    for (size_t i = 0; i < size; i++) total -= i + 3;
     return total;
-int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_sub(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_sub_result_int, NULL, NULL, test_atomic_sub_result_long, NULL, NULL };
+    TestFns set = { INT_TEST_VALUE,
+                    LONG_TEST_VALUE,
+                    NULL,
+                    test_atomic_sub_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_sub_result_long,
+                    NULL,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_sub_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_sub_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
 #pragma mark ---- xchg
-const char atom_xchg_core[] = "    oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";
+const char atom_xchg_core[] =
+    "    oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";
-const char atomic_xchg_core[] = "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
-const char atomic_xchg_float_core[] = "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
+const char atomic_xchg_core[] =
+    "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
+const char atomic_xchg_float_core[] =
+    "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
-bool test_atomic_xchg_verify_int( size_t size, cl_int *refValues, cl_int finalValue )
+bool test_atomic_xchg_verify_int(size_t size, cl_int *refValues,
+                                 cl_int finalValue)
-    /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
+    /* For xchg, each value from 0 to size - 1 should have an entry in the ref
+     * array, and ONLY one entry */
     char *valids;
     size_t i;
     char originalValidCount = 0;
-    valids = (char *)malloc( sizeof( char ) * size );
-    memset( valids, 0, sizeof( char ) * size );
+    valids = (char *)malloc(sizeof(char) * size);
+    memset(valids, 0, sizeof(char) * size);
-    for( i = 0; i < size; i++ )
+    for (i = 0; i < size; i++)
-        if( refValues[ i ] == INT_TEST_VALUE )
+        if (refValues[i] == INT_TEST_VALUE)
             // Special initial value
-        if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
+        if (refValues[i] < 0 || (size_t)refValues[i] >= size)
-            log_error( "ERROR: Reference value %ld outside of valid range! (%d)\n", i, refValues[ i ] );
+            log_error(
+                "ERROR: Reference value %zu outside of valid range! (%d)\n", i,
+                refValues[i]);
             return false;
-        valids[ refValues[ i ] ] ++;
+        valids[refValues[i]]++;
-    /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
-     the final value outputted */
-    if( valids[ finalValue ] > 0 )
+    /* Note: ONE entry will have zero count. It'll be the last one that
+     executed, because that value should be the final value outputted */
+    if (valids[finalValue] > 0)
-        log_error( "ERROR: Final value %d was also in ref list!\n", finalValue );
+        log_error("ERROR: Final value %d was also in ref list!\n", finalValue);
         return false;
-        valids[ finalValue ] = 1;    // So the following loop will be okay
+        valids[finalValue] = 1; // So the following loop will be okay
     /* Now check that every entry has one and only one count */
-    if( originalValidCount != 1 )
+    if (originalValidCount != 1)
-        log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
+        log_error("ERROR: Starting reference value %d did not occur "
+                  "once-and-only-once (occurred %d)\n",
+                  65191, originalValidCount);
         return false;
-    for( i = 0; i < size; i++ )
+    for (i = 0; i < size; i++)
-        if( valids[ i ] != 1 )
+        if (valids[i] != 1)
-            log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
-            for( size_t j = 0; j < size; j++ )
-                log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
+            log_error("ERROR: Reference value %zu did not occur "
+                      "once-and-only-once (occurred %d)\n",
+                      i, valids[i]);
+            for (size_t j = 0; j < size; j++)
+                log_info("%d: %d\n", (int)j, (int)valids[j]);
             return false;
-    free( valids );
+    free(valids);
     return true;
-bool test_atomic_xchg_verify_long( size_t size, cl_long *refValues, cl_long finalValue )
+bool test_atomic_xchg_verify_long(size_t size, cl_long *refValues,
+                                  cl_long finalValue)
-    /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
+    /* For xchg, each value from 0 to size - 1 should have an entry in the ref
+     * array, and ONLY one entry */
     char *valids;
     size_t i;
     char originalValidCount = 0;
-    valids = (char *)malloc( sizeof( char ) * size );
-    memset( valids, 0, sizeof( char ) * size );
+    valids = (char *)malloc(sizeof(char) * size);
+    memset(valids, 0, sizeof(char) * size);
-    for( i = 0; i < size; i++ )
+    for (i = 0; i < size; i++)
-        if( refValues[ i ] == LONG_TEST_VALUE )
+        if (refValues[i] == LONG_TEST_VALUE)
             // Special initial value
-        if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
+        if (refValues[i] < 0 || (size_t)refValues[i] >= size)
-            log_error( "ERROR: Reference value %ld outside of valid range! (%lld)\n", i, refValues[ i ] );
+            log_error(
+                "ERROR: Reference value %zu outside of valid range! (%" PRId64
+                ")\n",
+                i, refValues[i]);
             return false;
-        valids[ refValues[ i ] ] ++;
+        valids[refValues[i]]++;
-    /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
-     the final value outputted */
-    if( valids[ finalValue ] > 0 )
+    /* Note: ONE entry will have zero count. It'll be the last one that
+     executed, because that value should be the final value outputted */
+    if (valids[finalValue] > 0)
-        log_error( "ERROR: Final value %lld was also in ref list!\n", finalValue );
+        log_error("ERROR: Final value %" PRId64 " was also in ref list!\n",
+                  finalValue);
         return false;
-        valids[ finalValue ] = 1;    // So the following loop will be okay
+        valids[finalValue] = 1; // So the following loop will be okay
     /* Now check that every entry has one and only one count */
-    if( originalValidCount != 1 )
+    if (originalValidCount != 1)
-        log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
+        log_error("ERROR: Starting reference value %d did not occur "
+                  "once-and-only-once (occurred %d)\n",
+                  65191, originalValidCount);
         return false;
-    for( i = 0; i < size; i++ )
+    for (i = 0; i < size; i++)
-        if( valids[ i ] != 1 )
+        if (valids[i] != 1)
-            log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
-            for( size_t j = 0; j < size; j++ )
-                log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
+            log_error("ERROR: Reference value %zu did not occur "
+                      "once-and-only-once (occurred %d)\n",
+                      i, valids[i]);
+            for (size_t j = 0; j < size; j++)
+                log_info("%d: %d\n", (int)j, (int)valids[j]);
             return false;
-    free( valids );
+    free(valids);
     return true;
-bool test_atomic_xchg_verify_float( size_t size, cl_float *refValues, cl_float finalValue )
+bool test_atomic_xchg_verify_float(size_t size, cl_float *refValues,
+                                   cl_float finalValue)
-    /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
+    /* For xchg, each value from 0 to size - 1 should have an entry in the ref
+     * array, and ONLY one entry */
     char *valids;
     size_t i;
     char originalValidCount = 0;
-    valids = (char *)malloc( sizeof( char ) * size );
-    memset( valids, 0, sizeof( char ) * size );
+    valids = (char *)malloc(sizeof(char) * size);
+    memset(valids, 0, sizeof(char) * size);
-    for( i = 0; i < size; i++ )
+    for (i = 0; i < size; i++)
-        cl_int *intRefValue = (cl_int *)( &refValues[ i ] );
-        if( *intRefValue == INT_TEST_VALUE )
+        cl_int *intRefValue = (cl_int *)(&refValues[i]);
+        if (*intRefValue == INT_TEST_VALUE)
             // Special initial value
-        if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
+        if (refValues[i] < 0 || (size_t)refValues[i] >= size)
-            log_error( "ERROR: Reference value %ld outside of valid range! (%a)\n", i, refValues[ i ] );
+            log_error(
+                "ERROR: Reference value %zu outside of valid range! (%a)\n", i,
+                refValues[i]);
             return false;
-        valids[ (int)refValues[ i ] ] ++;
+        valids[(int)refValues[i]]++;
-    /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
-     the final value outputted */
-    if( valids[ (int)finalValue ] > 0 )
+    /* Note: ONE entry will have zero count. It'll be the last one that
+     executed, because that value should be the final value outputted */
+    if (valids[(int)finalValue] > 0)
-        log_error( "ERROR: Final value %a was also in ref list!\n", finalValue );
+        log_error("ERROR: Final value %a was also in ref list!\n", finalValue);
         return false;
-        valids[ (int)finalValue ] = 1;    // So the following loop will be okay
+        valids[(int)finalValue] = 1; // So the following loop will be okay
     /* Now check that every entry has one and only one count */
-    if( originalValidCount != 1 )
+    if (originalValidCount != 1)
-        log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
+        log_error("ERROR: Starting reference value %d did not occur "
+                  "once-and-only-once (occurred %d)\n",
+                  65191, originalValidCount);
         return false;
-    for( i = 0; i < size; i++ )
+    for (i = 0; i < size; i++)
-        if( valids[ i ] != 1 )
+        if (valids[i] != 1)
-            log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
-            for( size_t j = 0; j < size; j++ )
-                log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
+            log_error("ERROR: Reference value %zu did not occur "
+                      "once-and-only-once (occurred %d)\n",
+                      i, valids[i]);
+            for (size_t j = 0; j < size; j++)
+                log_info("%d: %d\n", (int)j, (int)valids[j]);
             return false;
-    free( valids );
+    free(valids);
     return true;
-int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_xchg(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
-    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, NULL, NULL, test_atomic_xchg_verify_int, NULL, NULL, test_atomic_xchg_verify_long, NULL, NULL, test_atomic_xchg_verify_float };
+    TestFns set = { INT_TEST_VALUE,
+                    LONG_TEST_VALUE,
+                    NULL,
+                    NULL,
+                    NULL,
+                    test_atomic_xchg_verify_int,
+                    NULL,
+                    NULL,
+                    test_atomic_xchg_verify_long,
+                    NULL,
+                    NULL,
+                    test_atomic_xchg_verify_float };
-    int errors = test_atomic_function_set( deviceID, context, queue, num_elements, atom_xchg_core, set, false, true, /*usingAtomicPrefix*/ false  );
-    errors |= test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xchg_core, set, false, true, /*usingAtomicPrefix*/ true  );
+    int errors = test_atomic_function_set(
+        deviceID, context, queue, num_elements, atom_xchg_core, set, false,
+        true, /*usingAtomicPrefix*/ false);
+    errors |= test_atomic_function_set(deviceID, context, queue, num_elements,
+                                       atomic_xchg_core, set, false, true,
+                                       /*usingAtomicPrefix*/ true);
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, false, kFloat, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, true, kFloat, true );
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atomic_xchg_float_core, set, false, false,
+                                   kFloat, true);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atomic_xchg_float_core, set, false, true,
+                                   kFloat, true);
     return errors;
@@ -716,51 +888,71 @@
 #pragma mark ---- min
-const char atom_min_core[] = "    oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";
+const char atom_min_core[] =
+    "    oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";
-const char atomic_min_core[] = "    oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
+const char atomic_min_core[] =
+    "    oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
-cl_int test_atomic_min_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_min_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichDestValue)
     cl_int total = 0x7fffffffL;
-    for( size_t i = 0; i < size; i++ )
+    for (size_t i = 0; i < size; i++)
-        if( startRefValues[ i ] < total )
-            total = startRefValues[ i ];
+        if (startRefValues[i] < total) total = startRefValues[i];
     return total;
-void test_atomic_min_gen_int( size_t size, cl_int *startRefValues, MTdata d )
+void test_atomic_min_gen_int(size_t size, cl_int *startRefValues, MTdata d)
-    for( size_t i = 0; i < size; i++ )
-        startRefValues[i] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
+    for (size_t i = 0; i < size; i++)
+        startRefValues[i] =
+            (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff;
-cl_long test_atomic_min_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_min_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichDestValue)
     cl_long total = 0x7fffffffffffffffLL;
-    for( size_t i = 0; i < size; i++ )
+    for (size_t i = 0; i < size; i++)
-        if( startRefValues[ i ] < total )
-            total = startRefValues[ i ];
+        if (startRefValues[i] < total) total = startRefValues[i];
     return total;
-void test_atomic_min_gen_long( size_t size, cl_long *startRefValues, MTdata d )
+void test_atomic_min_gen_long(size_t size, cl_long *startRefValues, MTdata d)
-    for( size_t i = 0; i < size; i++ )
-        startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
+    for (size_t i = 0; i < size; i++)
+        startRefValues[i] =
+            (cl_long)(genrand_int32(d)
+                      | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
-int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_min(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { 0x7fffffffL, 0x7fffffffffffffffLL, NULL, test_atomic_min_result_int, test_atomic_min_gen_int, NULL, test_atomic_min_result_long, test_atomic_min_gen_long, NULL };
+    TestFns set = { 0x7fffffffL,
+                    0x7fffffffffffffffLL,
+                    NULL,
+                    test_atomic_min_result_int,
+                    test_atomic_min_gen_int,
+                    NULL,
+                    test_atomic_min_result_long,
+                    test_atomic_min_gen_long,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_min_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_min_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
@@ -768,79 +960,118 @@
 #pragma mark ---- max
-const char atom_max_core[] = "    oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";
+const char atom_max_core[] =
+    "    oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";
-const char atomic_max_core[] = "    oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
+const char atomic_max_core[] =
+    "    oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
-cl_int test_atomic_max_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_max_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichDestValue)
     cl_int total = 0;
-    for( size_t i = 0; i < size; i++ )
+    for (size_t i = 0; i < size; i++)
-        if( startRefValues[ i ] > total )
-            total = startRefValues[ i ];
+        if (startRefValues[i] > total) total = startRefValues[i];
     return total;
-void test_atomic_max_gen_int( size_t size, cl_int *startRefValues, MTdata d )
+void test_atomic_max_gen_int(size_t size, cl_int *startRefValues, MTdata d)
-    for( size_t i = 0; i < size; i++ )
-        startRefValues[i] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
+    for (size_t i = 0; i < size; i++)
+        startRefValues[i] =
+            (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff;
-cl_long test_atomic_max_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_max_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichDestValue)
     cl_long total = 0;
-    for( size_t i = 0; i < size; i++ )
+    for (size_t i = 0; i < size; i++)
-        if( startRefValues[ i ] > total )
-            total = startRefValues[ i ];
+        if (startRefValues[i] > total) total = startRefValues[i];
     return total;
-void test_atomic_max_gen_long( size_t size, cl_long *startRefValues, MTdata d )
+void test_atomic_max_gen_long(size_t size, cl_long *startRefValues, MTdata d)
-    for( size_t i = 0; i < size; i++ )
-        startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
+    for (size_t i = 0; i < size; i++)
+        startRefValues[i] =
+            (cl_long)(genrand_int32(d)
+                      | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
-int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_max(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { 0, 0, NULL, test_atomic_max_result_int, test_atomic_max_gen_int, NULL, test_atomic_max_result_long, test_atomic_max_gen_long, NULL };
+    TestFns set = { 0,
+                    0,
+                    NULL,
+                    test_atomic_max_result_int,
+                    test_atomic_max_gen_int,
+                    NULL,
+                    test_atomic_max_result_long,
+                    test_atomic_max_gen_long,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_max_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
-      return -1;
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_max_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
+        return -1;
     return 0;
 #pragma mark ---- inc
-const char atom_inc_core[] = "    oldValues[tid] = atom_inc( &destMemory[0] );\n";
+const char atom_inc_core[] =
+    "    oldValues[tid] = atom_inc( &destMemory[0] );\n";
-const char atomic_inc_core[] = "    oldValues[tid] = atomic_inc( &destMemory[0] );\n";
+const char atomic_inc_core[] =
+    "    oldValues[tid] = atomic_inc( &destMemory[0] );\n";
-cl_int test_atomic_inc_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_inc_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichDestValue)
     return INT_TEST_VALUE + (cl_int)size;
-cl_long test_atomic_inc_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_inc_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichDestValue)
     return LONG_TEST_VALUE + size;
-int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_inc(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_inc_result_int, NULL, NULL, test_atomic_inc_result_long, NULL, NULL };
+    TestFns set = { INT_TEST_VALUE,
+                    LONG_TEST_VALUE,
+                    NULL,
+                    test_atomic_inc_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_inc_result_long,
+                    NULL,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_inc_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_inc_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
@@ -848,27 +1079,46 @@
 #pragma mark ---- dec
-const char atom_dec_core[] = "    oldValues[tid] = atom_dec( &destMemory[0] );\n";
+const char atom_dec_core[] =
+    "    oldValues[tid] = atom_dec( &destMemory[0] );\n";
-const char atomic_dec_core[] = "    oldValues[tid] = atomic_dec( &destMemory[0] );\n";
+const char atomic_dec_core[] =
+    "    oldValues[tid] = atomic_dec( &destMemory[0] );\n";
-cl_int test_atomic_dec_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_dec_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichDestValue)
     return INT_TEST_VALUE - (cl_int)size;
-cl_long test_atomic_dec_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_dec_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichDestValue)
     return LONG_TEST_VALUE - size;
-int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_dec(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_dec_result_int, NULL, NULL, test_atomic_dec_result_long, NULL, NULL };
+    TestFns set = { INT_TEST_VALUE,
+                    LONG_TEST_VALUE,
+                    NULL,
+                    test_atomic_dec_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_dec_result_long,
+                    NULL,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_dec_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_dec_core, set, false,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
@@ -877,129 +1127,159 @@
 #pragma mark ---- cmpxchg
 /* We test cmpxchg by implementing (the long way) atom_add */
+// clang-format off
 const char atom_cmpxchg_core[] =
-"    int oldValue, origValue, newValue;\n"
-"    do { \n"
-"        origValue = destMemory[0];\n"
-"        newValue = origValue + tid + 2;\n"
-"        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
-"    } while( oldValue != origValue );\n"
-"    oldValues[tid] = oldValue;\n"
+    "    int oldValue, origValue, newValue;\n"
+    "    do { \n"
+    "        origValue = destMemory[0];\n"
+    "        newValue = origValue + tid + 2;\n"
+    "        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
+    "    } while( oldValue != origValue );\n"
+    "    oldValues[tid] = oldValue;\n";
 const char atom_cmpxchg64_core[] =
-"    long oldValue, origValue, newValue;\n"
-"    do { \n"
-"        origValue = destMemory[0];\n"
-"        newValue = origValue + tid + 2;\n"
-"        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
-"    } while( oldValue != origValue );\n"
-"    oldValues[tid] = oldValue;\n"
+    "    long oldValue, origValue, newValue;\n"
+    "    do { \n"
+    "        origValue = destMemory[0];\n"
+    "        newValue = origValue + tid + 2;\n"
+    "        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
+    "    } while( oldValue != origValue );\n"
+    "    oldValues[tid] = oldValue;\n";
 const char atomic_cmpxchg_core[] =
-"    int oldValue, origValue, newValue;\n"
-"    do { \n"
-"        origValue = destMemory[0];\n"
-"        newValue = origValue + tid + 2;\n"
-"        oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
-"    } while( oldValue != origValue );\n"
-"    oldValues[tid] = oldValue;\n"
+    "    int oldValue, origValue, newValue;\n"
+    "    do { \n"
+    "        origValue = destMemory[0];\n"
+    "        newValue = origValue + tid + 2;\n"
+    "        oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
+    "    } while( oldValue != origValue );\n"
+    "    oldValues[tid] = oldValue;\n";
+// clang-format on
-cl_int test_atomic_cmpxchg_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
+cl_int test_atomic_cmpxchg_result_int(size_t size, cl_int *startRefValues,
+                                      size_t whichDestValue)
     cl_int total = INT_TEST_VALUE;
-    for( size_t i = 0; i < size; i++ )
-        total += (cl_int)i + 2;
+    for (size_t i = 0; i < size; i++) total += (cl_int)i + 2;
     return total;
-cl_long test_atomic_cmpxchg_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
+cl_long test_atomic_cmpxchg_result_long(size_t size, cl_long *startRefValues,
+                                        size_t whichDestValue)
     cl_long total = LONG_TEST_VALUE;
-    for( size_t i = 0; i < size; i++ )
-        total += i + 2;
+    for (size_t i = 0; i < size; i++) total += i + 2;
     return total;
-int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
-    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_cmpxchg_result_int, NULL, NULL, test_atomic_cmpxchg_result_long, NULL, NULL };
+    TestFns set = { INT_TEST_VALUE,
+                    LONG_TEST_VALUE,
+                    NULL,
+                    test_atomic_cmpxchg_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_cmpxchg_result_long,
+                    NULL,
+                    NULL };
     int errors = 0;
     log_info("    Testing atom_ functions...\n");
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, false, kInt, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, false, kUInt, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, true, kInt, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, true, kUInt, true );
+    errors |=
+        test_atomic_function(deviceID, context, queue, num_elements,
+                             atom_cmpxchg_core, set, false, false, kInt, true);
+    errors |=
+        test_atomic_function(deviceID, context, queue, num_elements,
+                             atom_cmpxchg_core, set, false, false, kUInt, true);
+    errors |=
+        test_atomic_function(deviceID, context, queue, num_elements,
+                             atom_cmpxchg_core, set, false, true, kInt, true);
+    errors |=
+        test_atomic_function(deviceID, context, queue, num_elements,
+                             atom_cmpxchg_core, set, false, true, kUInt, true);
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, false, kLong, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, false, kULong, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, true, kLong, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, true, kULong, true );
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atom_cmpxchg64_core, set, false, false,
+                                   kLong, true);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atom_cmpxchg64_core, set, false, false,
+                                   kULong, true);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atom_cmpxchg64_core, set, false, true, kLong,
+                                   true);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atom_cmpxchg64_core, set, false, true,
+                                   kULong, true);
     log_info("    Testing atomic_ functions...\n");
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, false, kInt, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, false, kUInt, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, true, kInt, true );
-    errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, true, kUInt, true );
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atomic_cmpxchg_core, set, false, false, kInt,
+                                   true);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atomic_cmpxchg_core, set, false, false,
+                                   kUInt, true);
+    errors |=
+        test_atomic_function(deviceID, context, queue, num_elements,
+                             atomic_cmpxchg_core, set, false, true, kInt, true);
+    errors |= test_atomic_function(deviceID, context, queue, num_elements,
+                                   atomic_cmpxchg_core, set, false, true, kUInt,
+                                   true);
-    if( errors )
-        return -1;
+    if (errors) return -1;
     return 0;
 #pragma mark -------- Bitwise functions
-size_t test_bitwise_num_results( size_t threadCount, ExplicitType dataType )
+size_t test_bitwise_num_results(size_t threadCount, ExplicitType dataType)
-    size_t numBits = get_explicit_type_size( dataType ) * 8;
+    size_t numBits = get_explicit_type_size(dataType) * 8;
-    return ( threadCount + numBits - 1 ) / numBits;
+    return (threadCount + numBits - 1) / numBits;
 #pragma mark ---- and
+// clang-format off
 const char atom_and_core[] =
-"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
-"    int  whichResult = tid / numBits;\n"
-"    int  bitIndex = tid - ( whichResult * numBits );\n"
-"    oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
+    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
+    "    int  whichResult = tid / numBits;\n"
+    "    int  bitIndex = tid - ( whichResult * numBits );\n"
+    "\n"
+    "    oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
 const char atomic_and_core[] =
-"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
-"    int  whichResult = tid / numBits;\n"
-"    int  bitIndex = tid - ( whichResult * numBits );\n"
-"    oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
+    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
+    "    int  whichResult = tid / numBits;\n"
+    "    int  bitIndex = tid - ( whichResult * numBits );\n"
+    "\n"
+    "    oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
+// clang-format on
-cl_int test_atomic_and_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
+cl_int test_atomic_and_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichResult)
-    size_t numThreads = ( (size_t)size + 31 ) / 32;
-    if( whichResult < numThreads - 1 )
-        return 0;
+    size_t numThreads = ((size_t)size + 31) / 32;
+    if (whichResult < numThreads - 1) return 0;
     // Last item doesn't get and'ed on every bit, so we have to mask away
     size_t numBits = (size_t)size - whichResult * 32;
     cl_int bits = (cl_int)0xffffffffL;
-    for( size_t i = 0; i < numBits; i++ )
-        bits &= ~( 1 << i );
+    for (size_t i = 0; i < numBits; i++) bits &= ~(1 << i);
     return bits;
-cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
+cl_long test_atomic_and_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichResult)
-    size_t numThreads = ( (size_t)size + 63 ) / 64;
-    if( whichResult < numThreads - 1 )
-        return 0;
+    size_t numThreads = ((size_t)size + 63) / 64;
+    if (whichResult < numThreads - 1) return 0;
     // Last item doesn't get and'ed on every bit, so we have to mask away
     size_t numBits = (size_t)size - whichResult * 64;
@@ -1009,14 +1289,28 @@
     return bits;
-int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_and(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { 0xffffffff, 0xffffffffffffffffLL, test_bitwise_num_results,
-        test_atomic_and_result_int, NULL, NULL, test_atomic_and_result_long, NULL, NULL };
+    TestFns set = { 0xffffffff,
+                    0xffffffffffffffffLL,
+                    test_bitwise_num_results,
+                    test_atomic_and_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_and_result_long,
+                    NULL,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_and_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_and_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
@@ -1024,59 +1318,68 @@
 #pragma mark ---- or
+// clang-format off
 const char atom_or_core[] =
-"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
-"    int  whichResult = tid / numBits;\n"
-"    int  bitIndex = tid - ( whichResult * numBits );\n"
-"    oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
+    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
+    "    int  whichResult = tid / numBits;\n"
+    "    int  bitIndex = tid - ( whichResult * numBits );\n"
+    "\n"
+    "    oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
 const char atomic_or_core[] =
-"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
-"    int  whichResult = tid / numBits;\n"
-"    int  bitIndex = tid - ( whichResult * numBits );\n"
-"    oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
+    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
+    "    int  whichResult = tid / numBits;\n"
+    "    int  bitIndex = tid - ( whichResult * numBits );\n"
+    "\n"
+    "    oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
+// clang-format on
-cl_int test_atomic_or_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
+cl_int test_atomic_or_result_int(size_t size, cl_int *startRefValues,
+                                 size_t whichResult)
-    size_t numThreads = ( (size_t)size + 31 ) / 32;
-    if( whichResult < numThreads - 1 )
-        return 0xffffffff;
+    size_t numThreads = ((size_t)size + 31) / 32;
+    if (whichResult < numThreads - 1) return 0xffffffff;
     // Last item doesn't get and'ed on every bit, so we have to mask away
     size_t numBits = (size_t)size - whichResult * 32;
     cl_int bits = 0;
-    for( size_t i = 0; i < numBits; i++ )
-        bits |= ( 1 << i );
+    for (size_t i = 0; i < numBits; i++) bits |= (1 << i);
     return bits;
-cl_long test_atomic_or_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
+cl_long test_atomic_or_result_long(size_t size, cl_long *startRefValues,
+                                   size_t whichResult)
-    size_t numThreads = ( (size_t)size + 63 ) / 64;
-    if( whichResult < numThreads - 1 )
-        return 0x0ffffffffffffffffLL;
+    size_t numThreads = ((size_t)size + 63) / 64;
+    if (whichResult < numThreads - 1) return 0x0ffffffffffffffffLL;
     // Last item doesn't get and'ed on every bit, so we have to mask away
     size_t numBits = (size_t)size - whichResult * 64;
     cl_long bits = 0;
-    for( size_t i = 0; i < numBits; i++ )
-        bits |= ( 1LL << i );
+    for (size_t i = 0; i < numBits; i++) bits |= (1LL << i);
     return bits;
-int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_or(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
-    TestFns set = { 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int, NULL, NULL, test_atomic_or_result_long, NULL, NULL };
+    TestFns set = {
+        0,    0LL,  test_bitwise_num_results,   test_atomic_or_result_int,
+        NULL, NULL, test_atomic_or_result_long, NULL,
+        NULL
+    };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_or_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_or_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
@@ -1096,33 +1399,44 @@
     "    oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
-cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
+cl_int test_atomic_xor_result_int(size_t size, cl_int *startRefValues,
+                                  size_t whichResult)
     cl_int total = 0x2f08ab41;
-    for( size_t i = 0; i < size; i++ )
-        total ^= ( 1 << ( i & 31 ) );
+    for (size_t i = 0; i < size; i++) total ^= (1 << (i & 31));
     return total;
-cl_long test_atomic_xor_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
+cl_long test_atomic_xor_result_long(size_t size, cl_long *startRefValues,
+                                    size_t whichResult)
     cl_long total = 0x2f08ab418ba0541LL;
-    for( size_t i = 0; i < size; i++ )
-        total ^= ( 1LL << ( i & 63 ) );
+    for (size_t i = 0; i < size; i++) total ^= (1LL << (i & 63));
     return total;
-int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_xor(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-    TestFns set = { 0x2f08ab41, 0x2f08ab418ba0541LL, NULL, test_atomic_xor_result_int, NULL, NULL, test_atomic_xor_result_long, NULL, NULL };
+    TestFns set = { 0x2f08ab41,
+                    0x2f08ab418ba0541LL,
+                    NULL,
+                    test_atomic_xor_result_int,
+                    NULL,
+                    NULL,
+                    test_atomic_xor_result_long,
+                    NULL,
+                    NULL };
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atom_xor_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
+        != 0)
         return -1;
-    if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
+    if (test_atomic_function_set(
+            deviceID, context, queue, num_elements, atomic_xor_core, set, true,
+            /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
+        != 0)
         return -1;
     return 0;
diff --git a/test_conformance/atomics/test_indexed_cases.cpp b/test_conformance/atomics/test_indexed_cases.cpp
index b85e3d2..2bba3e2 100644
--- a/test_conformance/atomics/test_indexed_cases.cpp
+++ b/test_conformance/atomics/test_indexed_cases.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,48 +16,55 @@
 #include "testBase.h"
 #include "harness/conversions.h"
-const char * atomic_index_source =
-"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
-"// Counter keeps track of which index in counts we are using.\n"
-"// We get that value, increment it, and then set that index in counts to our thread ID.\n"
-"// At the end of this we should have all thread IDs in some random location in counts\n"
-"// exactly once. If atom_add failed then we will write over various thread IDs and we\n"
-"// will be missing some.\n"
-"__kernel void add_index_test(__global int *counter, __global int *counts) {\n"
-"    int tid = get_global_id(0);\n"
-"    \n"
-"    int counter_to_use = atom_add(counter, 1);\n"
-"    counts[counter_to_use] = tid;\n"
+// clang-format off
+const char *atomic_index_source =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "// Counter keeps track of which index in counts we are using.\n"
+    "// We get that value, increment it, and then set that index in counts to our thread ID.\n"
+    "// At the end of this we should have all thread IDs in some random location in counts\n"
+    "// exactly once. If atom_add failed then we will write over various thread IDs and we\n"
+    "// will be missing some.\n"
+    "\n"
+    "__kernel void add_index_test(__global int *counter, __global int *counts) {\n"
+    "    int tid = get_global_id(0);\n"
+    "    \n"
+    "    int counter_to_use = atom_add(counter, 1);\n"
+    "    counts[counter_to_use] = tid;\n"
+    "}";
+// clang-format on
-int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_add_index(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper counter, counters;
     size_t numGlobalThreads, numLocalThreads;
-    int fail = 0, succeed = 0, err;
+    int fail = 0, err;
-  /* Check if atomics are supported. */
-  if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
-    log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
-    return 0;
-  }
+    /* Check if atomics are supported. */
+    if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics"))
+    {
+        log_info("Base atomics not supported "
+                 "(cl_khr_global_int32_base_atomics). Skipping test.\n");
+        return 0;
+    }
     //===== add_index test
     // The index test replicates what particles does.
-    // It uses one memory location to keep track of the current index and then each thread
-    // does an atomic add to it to get its new location. The threads then write to their
-    // assigned location. At the end we check to make sure that each thread's ID shows up
-    // exactly once in the output.
+    // It uses one memory location to keep track of the current index and then
+    // each thread does an atomic add to it to get its new location. The threads
+    // then write to their assigned location. At the end we check to make sure
+    // that each thread's ID shows up exactly once in the output.
     numGlobalThreads = 2048;
-    if( create_single_kernel_helper( context, &program, &kernel, 1, &atomic_index_source, "add_index_test" ) )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    &atomic_index_source, "add_index_test"))
         return -1;
-    if( get_max_common_work_group_size( context, kernel, numGlobalThreads, &numLocalThreads ) )
+    if (get_max_common_work_group_size(context, kernel, numGlobalThreads,
+                                       &numLocalThreads))
         return -1;
     log_info("Execute global_threads:%d local_threads:%d\n",
@@ -72,103 +79,148 @@
                               sizeof(cl_int) * numGlobalThreads, NULL, NULL);
     // Reset all those locations to -1 to indciate they have not been used.
-    cl_int *values = (cl_int*) malloc(sizeof(cl_int)*numGlobalThreads);
-    if (values == NULL) {
-        log_error("add_index_test FAILED to allocate memory for initial values.\n");
-        fail = 1; succeed = -1;
-    } else {
+    cl_int *values = (cl_int *)malloc(sizeof(cl_int) * numGlobalThreads);
+    if (values == NULL)
+    {
+        log_error(
+            "add_index_test FAILED to allocate memory for initial values.\n");
+        fail = 1;
+    }
+    else
+    {
         memset(values, -1, numLocalThreads);
-        unsigned int i=0;
-        for (i=0; i<numGlobalThreads; i++)
-            values[i] = -1;
-        int init=0;
-        err = clEnqueueWriteBuffer(queue, counters, true, 0, numGlobalThreads*sizeof(cl_int), values, 0, NULL, NULL);
-        err |= clEnqueueWriteBuffer(queue, counter, true, 0,1*sizeof(cl_int), &init, 0, NULL, NULL);
-        if (err) {
-            log_error("add_index_test FAILED to write initial values to arrays: %d\n", err);
-            fail=1; succeed=-1;
-        } else {
+        unsigned int i = 0;
+        for (i = 0; i < numGlobalThreads; i++) values[i] = -1;
+        int init = 0;
+        err = clEnqueueWriteBuffer(queue, counters, true, 0,
+                                   numGlobalThreads * sizeof(cl_int), values, 0,
+                                   NULL, NULL);
+        err |= clEnqueueWriteBuffer(queue, counter, true, 0, 1 * sizeof(cl_int),
+                                    &init, 0, NULL, NULL);
+        if (err)
+        {
+            log_error(
+                "add_index_test FAILED to write initial values to arrays: %d\n",
+                err);
+            fail = 1;
+        }
+        else
+        {
             err = clSetKernelArg(kernel, 0, sizeof(counter), &counter);
             err |= clSetKernelArg(kernel, 1, sizeof(counters), &counters);
-            if (err) {
-                log_error("add_index_test FAILED to set kernel arguments: %d\n", err);
-                fail=1; succeed=-1;
-            } else {
-                err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numGlobalThreads, &numLocalThreads, 0, NULL, NULL );
-                if (err) {
-                    log_error("add_index_test FAILED to execute kernel: %d\n", err);
-                    fail=1; succeed=-1;
-                } else {
-                    err = clEnqueueReadBuffer( queue, counters, true, 0, sizeof(cl_int)*numGlobalThreads, values, 0, NULL, NULL );
-                    if (err) {
-                        log_error("add_index_test FAILED to read back results: %d\n", err);
-                        fail = 1; succeed=-1;
-                    } else {
+            if (err)
+            {
+                log_error("add_index_test FAILED to set kernel arguments: %d\n",
+                          err);
+                fail = 1;
+            }
+            else
+            {
+                err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+                                             &numGlobalThreads,
+                                             &numLocalThreads, 0, NULL, NULL);
+                if (err)
+                {
+                    log_error("add_index_test FAILED to execute kernel: %d\n",
+                              err);
+                    fail = 1;
+                }
+                else
+                {
+                    err = clEnqueueReadBuffer(queue, counters, true, 0,
+                                              sizeof(cl_int) * numGlobalThreads,
+                                              values, 0, NULL, NULL);
+                    if (err)
+                    {
+                        log_error(
+                            "add_index_test FAILED to read back results: %d\n",
+                            err);
+                        fail = 1;
+                    }
+                    else
+                    {
                         unsigned int looking_for, index;
-                        for (looking_for=0; looking_for<numGlobalThreads; looking_for++) {
-                            int instances_found=0;
-                            for (index=0; index<numGlobalThreads; index++) {
-                                if (values[index]==(int)looking_for)
+                        for (looking_for = 0; looking_for < numGlobalThreads;
+                             looking_for++)
+                        {
+                            int instances_found = 0;
+                            for (index = 0; index < numGlobalThreads; index++)
+                            {
+                                if (values[index] == (int)looking_for)
-                            if (instances_found != 1) {
-                                log_error("add_index_test FAILED: wrong number of instances (%d!=1) for counter %d.\n", instances_found, looking_for);
-                                fail = 1; succeed=-1;
+                            if (instances_found != 1)
+                            {
+                                log_error(
+                                    "add_index_test FAILED: wrong number of "
+                                    "instances (%d!=1) for counter %d.\n",
+                                    instances_found, looking_for);
+                                fail = 1;
-        if (!fail) {
-            log_info("add_index_test passed. Each thread used exactly one index.\n");
+        if (!fail)
+        {
+            log_info(
+                "add_index_test passed. Each thread used exactly one index.\n");
     return fail;
+// clang-format off
 const char *add_index_bin_kernel[] = {
-"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
-"// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n"
-"// using an atomic add to keep track of the current location to write into in each bin.\n"
-"// This is the same as the memory update for the particles demo.\n"
-"__kernel void add_index_bin_test(__global int *bin_counters, __global int *bins, __global int *bin_assignments, int max_counts_per_bin) {\n"
-"    int tid = get_global_id(0);\n"
-"    int location = bin_assignments[tid];\n"
-"    int counter = atom_add(&bin_counters[location], 1);\n"
-"    bins[location*max_counts_per_bin + counter] = tid;\n"
-"}" };
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n"
+    "// using an atomic add to keep track of the current location to write into in each bin.\n"
+    "// This is the same as the memory update for the particles demo.\n"
+    "\n"
+    "__kernel void add_index_bin_test(__global int *bin_counters, __global int *bins, __global int *bin_assignments, int max_counts_per_bin) {\n"
+    "    int tid = get_global_id(0);\n"
+    "\n"
+    "    int location = bin_assignments[tid];\n"
+    "    int counter = atom_add(&bin_counters[location], 1);\n"
+    "    bins[location*max_counts_per_bin + counter] = tid;\n"
+    "}" };
+// clang-format on
-// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel
-// using an atomic add to keep track of the current location to write into in each bin.
-// This is the same as the memory update for the particles demo.
-int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_context context, MTdata d)
+// This test assigns a bunch of values to bins and then tries to put them in the
+// bins in parallel using an atomic add to keep track of the current location to
+// write into in each bin. This is the same as the memory update for the
+// particles demo.
+int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
+                       cl_context context, MTdata d)
     int number_of_items = (int)global_threads[0];
     size_t local_threads[1];
     int divisor = 12;
-    int number_of_bins = number_of_items/divisor;
-    int max_counts_per_bin = divisor*2;
+    int number_of_bins = number_of_items / divisor;
+    int max_counts_per_bin = divisor * 2;
     int fail = 0;
-    int succeed = 0;
     int err;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    //  log_info("add_index_bin_test: %d items, into %d bins, with a max of %d items per bin (bins is %d long).\n",
-    //           number_of_items, number_of_bins, max_counts_per_bin, number_of_bins*max_counts_per_bin);
+    //  log_info("add_index_bin_test: %d items, into %d bins, with a max of %d
+    //  items per bin (bins is %d long).\n",
+    //           number_of_items, number_of_bins, max_counts_per_bin,
+    //           number_of_bins*max_counts_per_bin);
     //===== add_index_bin test
     // The index test replicates what particles does.
-    err = create_single_kernel_helper(context, &program, &kernel, 1, add_index_bin_kernel, "add_index_bin_test" );
-    test_error( err, "Unable to create testing kernel" );
+    err =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    add_index_bin_kernel, "add_index_bin_test");
+    test_error(err, "Unable to create testing kernel");
-    if( get_max_common_work_group_size( context, kernel, global_threads[0], &local_threads[0] ) )
+    if (get_max_common_work_group_size(context, kernel, global_threads[0],
+                                       &local_threads[0]))
         return -1;
     log_info("Execute global_threads:%d local_threads:%d\n",
@@ -185,152 +237,228 @@
         clCreateBuffer(context, CL_MEM_READ_ONLY,
                        sizeof(cl_int) * number_of_items, NULL, NULL);
-    if (bin_counters == NULL) {
+    if (bin_counters == NULL)
+    {
         log_error("add_index_bin_test FAILED to allocate bin_counters.\n");
         return -1;
-    if (bins == NULL) {
+    if (bins == NULL)
+    {
         log_error("add_index_bin_test FAILED to allocate bins.\n");
         return -1;
-    if (bin_assignments == NULL) {
+    if (bin_assignments == NULL)
+    {
         log_error("add_index_bin_test FAILED to allocate bin_assignments.\n");
         return -1;
     // Initialize our storage
-    cl_int *l_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins);
-    if (!l_bin_counts) {
-        log_error("add_index_bin_test FAILED to allocate initial values for bin_counters.\n");
+    cl_int *l_bin_counts = (cl_int *)malloc(sizeof(cl_int) * number_of_bins);
+    if (!l_bin_counts)
+    {
+        log_error("add_index_bin_test FAILED to allocate initial values for "
+                  "bin_counters.\n");
         return -1;
     int i;
-    for (i=0; i<number_of_bins; i++)
-        l_bin_counts[i] = 0;
-    err = clEnqueueWriteBuffer(queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, l_bin_counts, 0, NULL, NULL);
-    if (err) {
-        log_error("add_index_bin_test FAILED to set initial values for bin_counters: %d\n", err);
+    for (i = 0; i < number_of_bins; i++) l_bin_counts[i] = 0;
+    err = clEnqueueWriteBuffer(queue, bin_counters, true, 0,
+                               sizeof(cl_int) * number_of_bins, l_bin_counts, 0,
+                               NULL, NULL);
+    if (err)
+    {
+        log_error("add_index_bin_test FAILED to set initial values for "
+                  "bin_counters: %d\n",
+                  err);
         return -1;
-    cl_int *values = (cl_int*)malloc(sizeof(cl_int)*number_of_bins*max_counts_per_bin);
-    if (!values) {
-        log_error("add_index_bin_test FAILED to allocate initial values for bins.\n");
+    cl_int *values =
+        (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin);
+    if (!values)
+    {
+        log_error(
+            "add_index_bin_test FAILED to allocate initial values for bins.\n");
         return -1;
-    for (i=0; i<number_of_bins*max_counts_per_bin; i++)
-        values[i] = -1;
-    err = clEnqueueWriteBuffer(queue, bins, true, 0, sizeof(cl_int)*number_of_bins*max_counts_per_bin, values, 0, NULL, NULL);
-    if (err) {
-        log_error("add_index_bin_test FAILED to set initial values for bins: %d\n", err);
+    for (i = 0; i < number_of_bins * max_counts_per_bin; i++) values[i] = -1;
+    err = clEnqueueWriteBuffer(queue, bins, true, 0,
+                               sizeof(cl_int) * number_of_bins
+                                   * max_counts_per_bin,
+                               values, 0, NULL, NULL);
+    if (err)
+    {
+        log_error(
+            "add_index_bin_test FAILED to set initial values for bins: %d\n",
+            err);
         return -1;
-    cl_int *l_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_items);
-    if (!l_bin_assignments) {
-        log_error("add_index_bin_test FAILED to allocate initial values for l_bin_assignments.\n");
+    cl_int *l_bin_assignments =
+        (cl_int *)malloc(sizeof(cl_int) * number_of_items);
+    if (!l_bin_assignments)
+    {
+        log_error("add_index_bin_test FAILED to allocate initial values for "
+                  "l_bin_assignments.\n");
         return -1;
-    for (i=0; i<number_of_items; i++) {
-        int bin = random_in_range(0, number_of_bins-1, d);
-        while (l_bin_counts[bin] >= max_counts_per_bin) {
-            bin = random_in_range(0, number_of_bins-1, d);
+    for (i = 0; i < number_of_items; i++)
+    {
+        int bin = random_in_range(0, number_of_bins - 1, d);
+        while (l_bin_counts[bin] >= max_counts_per_bin)
+        {
+            bin = random_in_range(0, number_of_bins - 1, d);
         if (bin >= number_of_bins)
-            log_error("add_index_bin_test internal error generating bin assignments: bin %d >= number_of_bins %d.\n", bin, number_of_bins);
-        if (l_bin_counts[bin]+1 > max_counts_per_bin)
-            log_error("add_index_bin_test internal error generating bin assignments: bin %d has more entries (%d) than max_counts_per_bin (%d).\n", bin, l_bin_counts[bin], max_counts_per_bin);
+            log_error("add_index_bin_test internal error generating bin "
+                      "assignments: bin %d >= number_of_bins %d.\n",
+                      bin, number_of_bins);
+        if (l_bin_counts[bin] + 1 > max_counts_per_bin)
+            log_error(
+                "add_index_bin_test internal error generating bin assignments: "
+                "bin %d has more entries (%d) than max_counts_per_bin (%d).\n",
+                bin, l_bin_counts[bin], max_counts_per_bin);
         l_bin_assignments[i] = bin;
-        //     log_info("item %d assigned to bin %d (%d items)\n", i, bin, l_bin_counts[bin]);
+        //     log_info("item %d assigned to bin %d (%d items)\n", i, bin,
+        //     l_bin_counts[bin]);
-    err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0, sizeof(cl_int)*number_of_items, l_bin_assignments, 0, NULL, NULL);
-    if (err) {
-        log_error("add_index_bin_test FAILED to set initial values for bin_assignments: %d\n", err);
+    err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0,
+                               sizeof(cl_int) * number_of_items,
+                               l_bin_assignments, 0, NULL, NULL);
+    if (err)
+    {
+        log_error("add_index_bin_test FAILED to set initial values for "
+                  "bin_assignments: %d\n",
+                  err);
         return -1;
     // Setup the kernel
     err = clSetKernelArg(kernel, 0, sizeof(bin_counters), &bin_counters);
     err |= clSetKernelArg(kernel, 1, sizeof(bins), &bins);
     err |= clSetKernelArg(kernel, 2, sizeof(bin_assignments), &bin_assignments);
-    err |= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin), &max_counts_per_bin);
-    if (err) {
-        log_error("add_index_bin_test FAILED to set kernel arguments: %d\n", err);
-        fail=1; succeed=-1;
+    err |= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin),
+                          &max_counts_per_bin);
+    if (err)
+    {
+        log_error("add_index_bin_test FAILED to set kernel arguments: %d\n",
+                  err);
+        fail = 1;
         return -1;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
-    if (err) {
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_threads,
+                                 local_threads, 0, NULL, NULL);
+    if (err)
+    {
         log_error("add_index_bin_test FAILED to execute kernel: %d\n", err);
-        fail=1; succeed=-1;
+        fail = 1;
-    cl_int *final_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_bins*max_counts_per_bin);
-    if (!final_bin_assignments) {
-        log_error("add_index_bin_test FAILED to allocate initial values for final_bin_assignments.\n");
+    cl_int *final_bin_assignments =
+        (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin);
+    if (!final_bin_assignments)
+    {
+        log_error("add_index_bin_test FAILED to allocate initial values for "
+                  "final_bin_assignments.\n");
         return -1;
-    err = clEnqueueReadBuffer( queue, bins, true, 0, sizeof(cl_int)*number_of_bins*max_counts_per_bin, final_bin_assignments, 0, NULL, NULL );
-    if (err) {
+    err = clEnqueueReadBuffer(queue, bins, true, 0,
+                              sizeof(cl_int) * number_of_bins
+                                  * max_counts_per_bin,
+                              final_bin_assignments, 0, NULL, NULL);
+    if (err)
+    {
         log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
-        fail = 1; succeed=-1;
+        fail = 1;
-    cl_int *final_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins);
-    if (!final_bin_counts) {
-        log_error("add_index_bin_test FAILED to allocate initial values for final_bin_counts.\n");
+    cl_int *final_bin_counts =
+        (cl_int *)malloc(sizeof(cl_int) * number_of_bins);
+    if (!final_bin_counts)
+    {
+        log_error("add_index_bin_test FAILED to allocate initial values for "
+                  "final_bin_counts.\n");
         return -1;
-    err = clEnqueueReadBuffer( queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, final_bin_counts, 0, NULL, NULL );
-    if (err) {
-        log_error("add_index_bin_test FAILED to read back bin_counters: %d\n", err);
-        fail = 1; succeed=-1;
+    err = clEnqueueReadBuffer(queue, bin_counters, true, 0,
+                              sizeof(cl_int) * number_of_bins, final_bin_counts,
+                              0, NULL, NULL);
+    if (err)
+    {
+        log_error("add_index_bin_test FAILED to read back bin_counters: %d\n",
+                  err);
+        fail = 1;
     // Verification.
-    int errors=0;
+    int errors = 0;
     int current_bin;
     int search;
     //  Print out all the contents of the bins.
     //  for (current_bin=0; current_bin<number_of_bins; current_bin++)
     //        for (search=0; search<max_counts_per_bin; search++)
-    //      log_info("[bin %d, entry %d] = %d\n", current_bin, search, final_bin_assignments[current_bin*max_counts_per_bin+search]);
+    //      log_info("[bin %d, entry %d] = %d\n", current_bin, search,
+    //      final_bin_assignments[current_bin*max_counts_per_bin+search]);
     // First verify that there are the correct number in each bin.
-    for (current_bin=0; current_bin<number_of_bins; current_bin++) {
+    for (current_bin = 0; current_bin < number_of_bins; current_bin++)
+    {
         int expected_number = l_bin_counts[current_bin];
         int actual_number = final_bin_counts[current_bin];
-        if (expected_number != actual_number) {
-            log_error("add_index_bin_test FAILED: bin %d reported %d entries when %d were expected.\n", current_bin, actual_number, expected_number);
+        if (expected_number != actual_number)
+        {
+            log_error("add_index_bin_test FAILED: bin %d reported %d entries "
+                      "when %d were expected.\n",
+                      current_bin, actual_number, expected_number);
-        for (search=0; search<expected_number; search++) {
-            if (final_bin_assignments[current_bin*max_counts_per_bin+search] == -1) {
-                log_error("add_index_bin_test FAILED: bin %d had no entry at position %d when it should have had %d entries.\n", current_bin, search, expected_number);
+        for (search = 0; search < expected_number; search++)
+        {
+            if (final_bin_assignments[current_bin * max_counts_per_bin + search]
+                == -1)
+            {
+                log_error("add_index_bin_test FAILED: bin %d had no entry at "
+                          "position %d when it should have had %d entries.\n",
+                          current_bin, search, expected_number);
-        for (search=expected_number; search<max_counts_per_bin; search++) {
-            if (final_bin_assignments[current_bin*max_counts_per_bin+search] != -1) {
-                log_error("add_index_bin_test FAILED: bin %d had an extra entry at position %d when it should have had only %d entries.\n", current_bin, search, expected_number);
+        for (search = expected_number; search < max_counts_per_bin; search++)
+        {
+            if (final_bin_assignments[current_bin * max_counts_per_bin + search]
+                != -1)
+            {
+                log_error(
+                    "add_index_bin_test FAILED: bin %d had an extra entry at "
+                    "position %d when it should have had only %d entries.\n",
+                    current_bin, search, expected_number);
     // Now verify that the correct ones are in each bin
     int index;
-    for (index=0; index<number_of_items; index++) {
+    for (index = 0; index < number_of_items; index++)
+    {
         int expected_bin = l_bin_assignments[index];
         int found_it = 0;
-        for (search=0; search<l_bin_counts[expected_bin]; search++) {
-            if (final_bin_assignments[expected_bin*max_counts_per_bin+search] == index) {
+        for (search = 0; search < l_bin_counts[expected_bin]; search++)
+        {
+            if (final_bin_assignments[expected_bin * max_counts_per_bin
+                                      + search]
+                == index)
+            {
                 found_it = 1;
-        if (found_it == 0) {
-            log_error("add_index_bin_test FAILED: did not find item %d in bin %d.\n", index, expected_bin);
+        if (found_it == 0)
+        {
+            log_error(
+                "add_index_bin_test FAILED: did not find item %d in bin %d.\n",
+                index, expected_bin);
@@ -341,41 +469,49 @@
-    if (errors == 0) {
-        log_info("add_index_bin_test passed. Each item was put in the correct bin in parallel.\n");
+    if (errors == 0)
+    {
+        log_info("add_index_bin_test passed. Each item was put in the correct "
+                 "bin in parallel.\n");
         return 0;
-    } else {
+    }
+    else
+    {
         log_error("add_index_bin_test FAILED: %d errors.\n", errors);
         return -1;
-int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
     //===== add_index_bin test
     size_t numGlobalThreads = 2048;
-    int iteration=0;
+    int iteration = 0;
     int err, failed = 0;
-    MTdata d = init_genrand( gRandomSeed );
+    MTdata d = init_genrand(gRandomSeed);
-  /* Check if atomics are supported. */
-  if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
-    log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
-    free_mtdata( d );
-    return 0;
-  }
+    /* Check if atomics are supported. */
+    if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics"))
+    {
+        log_info("Base atomics not supported "
+                 "(cl_khr_global_int32_base_atomics). Skipping test.\n");
+        free_mtdata(d);
+        return 0;
+    }
-    for(iteration=0; iteration<10; iteration++) {
-        log_info("add_index_bin_test with %d elements:\n", (int)numGlobalThreads);
-        err = add_index_bin_test(&numGlobalThreads,  queue,  context, d);
-        if (err) {
+    for (iteration = 0; iteration < 10; iteration++)
+    {
+        log_info("add_index_bin_test with %d elements:\n",
+                 (int)numGlobalThreads);
+        err = add_index_bin_test(&numGlobalThreads, queue, context, d);
+        if (err)
+        {
-        numGlobalThreads*=2;
+        numGlobalThreads *= 2;
-    free_mtdata( d );
+    free_mtdata(d);
     return failed;
diff --git a/test_conformance/basic/test_arraycopy.cpp b/test_conformance/basic/test_arraycopy.cpp
index 5a35286..d9dbcc1 100644
--- a/test_conformance/basic/test_arraycopy.cpp
+++ b/test_conformance/basic/test_arraycopy.cpp
@@ -181,9 +181,8 @@
-  // Keep track of multiple errors.
-  if (error_count != 0)
-    err = error_count;
+    // Keep track of multiple errors.
+    if (error_count != 0) err = error_count;
     if (err)
         log_error("\tCL_MEM_USE_HOST_PTR buffer with kernel copy FAILED\n");
diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp
index 9fbdcb6..bf3f155 100644
--- a/test_conformance/basic/test_async_copy2D.cpp
+++ b/test_conformance/basic/test_async_copy2D.cpp
@@ -25,77 +25,81 @@
 #include "../../test_common/harness/conversions.h"
 #include "procs.h"
-static const char *async_global_to_local_kernel2D =
-    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
-    "%s\n" // optional pragma string
-    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
-    "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
-    "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
-    "{\n"
-    " int i, j;\n"
-    // Zero the local storage first
-    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numElementsPerLine; j++)\n"
-    "     localBuffer[ (get_local_id( 0 "
-    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
-    "(%s)(%s)0;\n"
-    // Do this to verify all kernels are done zeroing the local buffer before we
-    // try the copy
-    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "    event_t event;\n"
-    "    event = async_work_group_copy_2D2D( (__local %s*)localBuffer, "
-    "(__global const "
-    "%s*)(src+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
-    "srcStride)), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, "
-    "srcStride, dstStride, 0 );\n"
-    // Wait for the copy to complete, then verify by manually copying to the
-    // dest
-    "     wait_group_events( 1, &event );\n"
-    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numElementsPerLine; j++)\n"
-    "     dst[ (get_global_id( 0 "
-    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
-    "localBuffer[ (get_local_id( 0 "
-    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ];\n"
-    "}\n";
+static const char *async_global_to_local_kernel2D = R"OpenCLC(
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+%s // optional pragma string
-static const char *async_local_to_global_kernel2D =
-    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
-    "%s\n" // optional pragma string
-    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
-    "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
-    "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
-    "{\n"
-    " int i, j;\n"
-    // Zero the local storage first
-    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numElementsPerLine; j++)\n"
-    "     localBuffer[ (get_local_id( 0 "
-    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = "
-    "(%s)(%s)0;\n"
-    // Do this to verify all kernels are done zeroing the local buffer before we
-    // try the copy
-    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
-    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numElementsPerLine; j++)\n"
-    "     localBuffer[ (get_local_id( 0 "
-    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = src[ "
-    "(get_global_id( 0 )*lineCopiesPerWorkItem+i)*(numElementsPerLine + "
-    "srcStride)+j ];\n"
-    // Do this to verify all kernels are done copying to the local buffer before
-    // we try the copy
-    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "    event_t event;\n"
-    "    event = async_work_group_copy_2D2D((__global "
-    "%s*)(dst+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
-    "dstStride)), (__local const %s*)localBuffer, (size_t)numElementsPerLine, "
-    "(size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0 );\n"
-    "    wait_group_events( 1, &event );\n"
-    "}\n";
+__kernel void test_fn(const __global %s *src, __global %s *dst,
+                      __local %s *localBuffer, int numElementsPerLine,
+                      int lineCopiesPerWorkgroup, int lineCopiesPerWorkItem,
+                      int srcStride, int dstStride) {
+  // Zero the local storage first
+  for (int i = 0; i < lineCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numElementsPerLine; j++) {
+      const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
+      localBuffer[index] = (%s)(%s)0;
+    }
+  }
+  // Do this to verify all kernels are done zeroing the local buffer before we
+  // try the copy
+  barrier( CLK_LOCAL_MEM_FENCE );
+  event_t event = async_work_group_copy_2D2D(localBuffer, 0, src,
+    lineCopiesPerWorkgroup * get_group_id(0) * srcStride, sizeof(%s),
+    (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0);
+  // Wait for the copy to complete, then verify by manually copying to the dest
+  wait_group_events(1, &event);
+  for (int i = 0; i < lineCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numElementsPerLine; j++) {
+      const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
+      const int global_index = (get_global_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
+      dst[global_index] = localBuffer[local_index];
+    }
+  }
+static const char *async_local_to_global_kernel2D = R"OpenCLC(
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+%s // optional pragma string
+__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+                      int numElementsPerLine, int lineCopiesPerWorkgroup,
+                      int lineCopiesPerWorkItem, int srcStride, int dstStride) {
+  // Zero the local storage first
+  for (int i = 0; i < lineCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numElementsPerLine; j++) {
+      const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
+      localBuffer[index] = (%s)(%s)0;
+    }
+  }
+  // Do this to verify all kernels are done zeroing the local buffer before we try the copy
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i = 0; i < lineCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numElementsPerLine; j++) {
+      const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
+      const int global_index = (get_global_id(0)*lineCopiesPerWorkItem + i) * srcStride + j;
+      localBuffer[local_index] = src[global_index];
+    }
+  }
+  // Do this to verify all kernels are done copying to the local buffer before we try the copy
+  barrier(CLK_LOCAL_MEM_FENCE);
+  event_t event = async_work_group_copy_2D2D(dst, lineCopiesPerWorkgroup * get_group_id(0) * dstStride,
+    localBuffer, 0, sizeof(%s), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride,
+   dstStride, 0 );
+  wait_group_events(1, &event);
 int test_copy2D(cl_device_id deviceID, cl_context context,
                 cl_command_queue queue, const char *kernelCode,
-                ExplicitType vecType, int vecSize, int srcStride, int dstStride,
+                ExplicitType vecType, int vecSize, int srcMargin, int dstMargin,
                 bool localIsDst)
     int error;
@@ -114,8 +118,8 @@
     size_t elementSize = get_explicit_type_size(vecType) * vecSize;
-    log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString,
-             srcStride, dstStride);
+    log_info("Testing %s with srcMargin = %d, dstMargin = %d\n", vecNameString,
+             srcMargin, dstMargin);
     cl_long max_local_mem_size;
     error =
@@ -153,7 +157,7 @@
             vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
                                : "",
             vecNameString, vecNameString, vecNameString, vecNameString,
-            get_explicit_type_name(vecType), vecNameString, vecNameString);
+            get_explicit_type_name(vecType), vecNameString);
     // log_info("program: %s\n", programSource);
     programPtr = programSource;
@@ -180,12 +184,17 @@
     if (max_workgroup_size > max_local_workgroup_size[0])
         max_workgroup_size = max_local_workgroup_size[0];
-    size_t numElementsPerLine = 10;
-    size_t lineCopiesPerWorkItem = 13;
+    const size_t numElementsPerLine = 10;
+    const cl_int dstStride = numElementsPerLine + dstMargin;
+    const cl_int srcStride = numElementsPerLine + srcMargin;
     elementSize =
         get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
-    size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize
-        * (numElementsPerLine + (localIsDst ? dstStride : srcStride));
+    const size_t lineCopiesPerWorkItem = 13;
+    const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem
+        * elementSize * (localIsDst ? dstStride : srcStride);
     size_t maxLocalWorkgroupSize =
         (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
@@ -199,34 +208,39 @@
     if (maxLocalWorkgroupSize > max_workgroup_size)
         localWorkgroupSize = max_workgroup_size;
-    size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride)
-        / (numElementsPerLine + srcStride);
-    size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride)
-        / (numElementsPerLine + dstStride);
-    size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut);
-    size_t maxLocalWorkgroups =
+    const size_t maxTotalLinesIn =
+        (max_alloc_size / elementSize + srcMargin) / srcStride;
+    const size_t maxTotalLinesOut =
+        (max_alloc_size / elementSize + dstMargin) / dstStride;
+    const size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut);
+    const size_t maxLocalWorkgroups =
         maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
-    size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
-        - (localIsDst ? dstStride : srcStride);
-    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
-    size_t totalLines =
+    const size_t localBufferSize =
+        localWorkgroupSize * localStorageSpacePerWorkitem
+        - (localIsDst ? dstMargin : srcMargin);
+    const size_t numberOfLocalWorkgroups =
+        std::min(1111, (int)maxLocalWorkgroups);
+    const size_t totalLines =
         numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
-    size_t inBufferSize = elementSize
-        * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride);
-    size_t outBufferSize = elementSize
-        * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride);
-    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+    const size_t inBufferSize = elementSize
+        * (totalLines * numElementsPerLine + (totalLines - 1) * srcMargin);
+    const size_t outBufferSize = elementSize
+        * (totalLines * numElementsPerLine + (totalLines - 1) * dstMargin);
+    const size_t globalWorkgroupSize =
+        numberOfLocalWorkgroups * localWorkgroupSize;
     inBuffer = (void *)malloc(inBufferSize);
     outBuffer = (void *)malloc(outBufferSize);
     outBufferCopy = (void *)malloc(outBufferSize);
-    cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt,
-        lineCopiesPerWorkgroup;
-    lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem;
-    numElementsPerLineInt = (int)numElementsPerLine;
-    lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize);
+    const cl_int lineCopiesPerWorkItemInt =
+        static_cast<cl_int>(lineCopiesPerWorkItem);
+    const cl_int numElementsPerLineInt =
+        static_cast<cl_int>(numElementsPerLine);
+    const cl_int lineCopiesPerWorkgroup =
+        static_cast<cl_int>(lineCopiesPerWorkItem * localWorkgroupSize);
         "Global: %d, local %d, local buffer %db, global in buffer %db, "
@@ -296,8 +310,8 @@
         for (int j = 0; j < (int)numElementsPerLine * elementSize;
              j += elementSize)
-            int inIdx = i * (numElementsPerLine + srcStride) + j;
-            int outIdx = i * (numElementsPerLine + dstStride) + j;
+            int inIdx = i * srcStride + j;
+            int outIdx = i * dstStride + j;
             if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
                 != 0)
@@ -332,11 +346,10 @@
         if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
                 * elementSize)
-            int outIdx = i * (numElementsPerLine + dstStride)
-                + numElementsPerLine * elementSize;
+            int outIdx = i * dstStride + numElementsPerLine * elementSize;
             if (memcmp(((char *)outBuffer) + outIdx,
                        ((char *)outBufferCopy) + outIdx,
-                       dstStride * elementSize)
+                       dstMargin * elementSize)
                 != 0)
                 if (failuresPrinted == 0)
@@ -373,9 +386,12 @@
         kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
         kULong, kFloat, kDouble, kNumExplicitTypes
+    // The margins below represent the number of elements between the end of
+    // one line and the start of the next. The strides are equivalent to the
+    // length of the line plus the chosen margin.
     unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
-    unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
-    unsigned int size, typeIndex, srcStride, dstStride;
+    unsigned int smallTypesMarginSizes[] = { 0, 10, 100 };
+    unsigned int size, typeIndex, srcMargin, dstMargin;
     int errors = 0;
@@ -401,19 +417,19 @@
             if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
                 <= 2) // small type
-                for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes)
-                         / sizeof(smallTypesStrideSizes[0]);
-                     srcStride++)
+                for (srcMargin = 0; srcMargin < sizeof(smallTypesMarginSizes)
+                         / sizeof(smallTypesMarginSizes[0]);
+                     srcMargin++)
-                    for (dstStride = 0;
-                         dstStride < sizeof(smallTypesStrideSizes)
-                             / sizeof(smallTypesStrideSizes[0]);
-                         dstStride++)
+                    for (dstMargin = 0;
+                         dstMargin < sizeof(smallTypesMarginSizes)
+                             / sizeof(smallTypesMarginSizes[0]);
+                         dstMargin++)
                         if (test_copy2D(deviceID, context, queue, kernelCode,
                                         vecType[typeIndex], vecSizes[size],
-                                        smallTypesStrideSizes[srcStride],
-                                        smallTypesStrideSizes[dstStride],
+                                        smallTypesMarginSizes[srcMargin],
+                                        smallTypesMarginSizes[dstMargin],
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
index 252159b..5eb41eb 100644
--- a/test_conformance/basic/test_async_copy3D.cpp
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -25,96 +25,95 @@
 #include "../../test_common/harness/conversions.h"
 #include "procs.h"
-static const char *async_global_to_local_kernel3D =
-    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
-    "%s\n" // optional pragma string
-    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
-    "%s *localBuffer, int numElementsPerLine, int numLines, int "
-    "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
-    "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
-    "{\n"
-    " int i, j, k;\n"
-    // Zero the local storage first
-    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numLines; j++)\n"
-    "     for(k=0; k<numElementsPerLine; k++)\n"
-    "       localBuffer[ (get_local_id( 0 "
-    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
-    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
-    "dstLineStride) + k ] = (%s)(%s)0;\n"
-    // Do this to verify all kernels are done zeroing the local buffer before we
-    // try the copy
-    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "    event_t event;\n"
-    "    event = async_work_group_copy_3D3D( (__local %s*)localBuffer, "
-    "(__global const "
-    "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
-    "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), "
-    "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, "
-    "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, "
-    "0 );\n"
-    // Wait for the copy to complete, then verify by manually copying to the
-    // dest
-    " wait_group_events( 1, &event );\n"
-    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numLines; j++)\n"
-    "     for(k=0; k<numElementsPerLine; k++)\n"
-    "       dst[ (get_global_id( 0 "
-    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
-    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
-    "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 "
-    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
-    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
-    "dstLineStride) + k ];\n"
-    "}\n";
+static const char *async_global_to_local_kernel3D = R"OpenCLC(
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+%s // optional pragma string
-static const char *async_local_to_global_kernel3D =
-    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
-    "%s\n" // optional pragma string
-    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
-    "%s *localBuffer, int numElementsPerLine, int numLines, int "
-    "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
-    "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
-    "{\n"
-    " int i, j, k;\n"
-    // Zero the local storage first
-    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numLines; j++)\n"
-    "     for(k=0; k<numElementsPerLine; k++)\n"
-    "       localBuffer[ (get_local_id( 0 "
-    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
-    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
-    "srcLineStride) + k ] = (%s)(%s)0;\n"
-    // Do this to verify all kernels are done zeroing the local buffer before we
-    // try the copy
-    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
-    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
-    "   for(j=0; j<numLines; j++)\n"
-    "     for(k=0; k<numElementsPerLine; k++)\n"
-    "       localBuffer[ (get_local_id( 0 "
-    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
-    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
-    "srcLineStride) + k ] = src[ (get_global_id( 0 "
-    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
-    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
-    "srcLineStride) + k ];\n"
-    // Do this to verify all kernels are done copying to the local buffer before
-    // we try the copy
-    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "    event_t event;\n"
-    "    event = async_work_group_copy_3D3D((__global "
-    "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
-    "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local "
-    "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, "
-    "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, "
-    "dstPlaneStride, 0 );\n"
-    "    wait_group_events( 1, &event );\n"
-    "}\n";
+__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+                      int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
+                      int planesCopiesPerWorkItem, int srcLineStride,
+                      int dstLineStride, int srcPlaneStride, int dstPlaneStride ) {
+  // Zero the local storage first
+  for (int i = 0; i < planesCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numLines; j++) {
+      for (int k = 0; k < numElementsPerLine; k++) {
+        const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
+        localBuffer[index] = (%s)(%s)0;
+      }
+    }
+  }
+  // Do this to verify all kernels are done zeroing the local buffer before we try the copy
+  barrier(CLK_LOCAL_MEM_FENCE);
+  event_t event = async_work_group_copy_3D3D(localBuffer, 0, src,
+    planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride,
+    sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines,
+    planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride,
+    dstPlaneStride, 0);
+  // Wait for the copy to complete, then verify by manually copying to the dest
+  wait_group_events(1, &event);
+  for (int i = 0; i < planesCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numLines; j++) {
+      for(int k = 0; k < numElementsPerLine; k++) {
+        const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
+        const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
+        dst[global_index] = localBuffer[local_index];
+      }
+    }
+  }
+static const char *async_local_to_global_kernel3D = R"OpenCLC(
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+%s // optional pragma string
+__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+                      int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
+                      int planesCopiesPerWorkItem, int srcLineStride,
+                      int dstLineStride, int srcPlaneStride, int dstPlaneStride) {
+  // Zero the local storage first
+  for (int i = 0; i < planesCopiesPerWorkItem; i++) {
+    for (int j = 0; j < numLines; j++) {
+      for (int k = 0; k < numElementsPerLine; k++) {
+        const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
+        localBuffer[index] = (%s)(%s)0;
+      }
+    }
+  }
+  // Do this to verify all kernels are done zeroing the local buffer before we try the copy
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i=0; i < planesCopiesPerWorkItem; i++) {
+    for (int j=0; j < numLines; j++) {
+      for (int k=0; k < numElementsPerLine; k++) {
+        const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
+        const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j*srcLineStride + k;
+        localBuffer[local_index] = src[global_index];
+      }
+    }
+  }
+  // Do this to verify all kernels are done copying to the local buffer before we try the copy
+  barrier(CLK_LOCAL_MEM_FENCE);
+  event_t event = async_work_group_copy_3D3D(dst,
+    planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0,
+    sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup,
+    srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 0);
+  wait_group_events(1, &event);
 int test_copy3D(cl_device_id deviceID, cl_context context,
                 cl_command_queue queue, const char *kernelCode,
-                ExplicitType vecType, int vecSize, int srcLineStride,
-                int dstLineStride, int srcPlaneStride, int dstPlaneStride,
+                ExplicitType vecType, int vecSize, int srcLineMargin,
+                int dstLineMargin, int srcPlaneMargin, int dstPlaneMargin,
                 bool localIsDst)
     int error;
@@ -133,10 +132,10 @@
     size_t elementSize = get_explicit_type_size(vecType) * vecSize;
-    log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, "
-             "srcPlaneStride = %d, dstPlaneStride = %d\n",
-             vecNameString, srcLineStride, dstLineStride, srcPlaneStride,
-             dstPlaneStride);
+    log_info("Testing %s with srcLineMargin = %d, dstLineMargin = %d, "
+             "srcPlaneMargin = %d, dstPlaneMargin = %d\n",
+             vecNameString, srcLineMargin, dstLineMargin, srcPlaneMargin,
+             dstPlaneMargin);
     cl_long max_local_mem_size;
     error =
@@ -201,16 +200,20 @@
     if (max_workgroup_size > max_local_workgroup_size[0])
         max_workgroup_size = max_local_workgroup_size[0];
-    size_t numElementsPerLine = 10;
-    size_t numLines = 13;
-    size_t planesCopiesPerWorkItem = 2;
+    const size_t numElementsPerLine = 10;
+    const cl_int dstLineStride = numElementsPerLine + dstLineMargin;
+    const cl_int srcLineStride = numElementsPerLine + srcLineMargin;
+    const size_t numLines = 13;
+    const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin;
+    const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin;
     elementSize =
         get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
-    size_t localStorageSpacePerWorkitem = elementSize
-        * (planesCopiesPerWorkItem
-           * (numLines * numElementsPerLine
-              + numLines * (localIsDst ? dstLineStride : srcLineStride)
-              + (localIsDst ? dstPlaneStride : srcPlaneStride)));
+    const size_t planesCopiesPerWorkItem = 2;
+    const size_t localStorageSpacePerWorkitem = elementSize
+        * planesCopiesPerWorkItem
+        * (localIsDst ? dstPlaneStride : srcPlaneStride);
     size_t maxLocalWorkgroupSize =
         (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
@@ -224,42 +227,41 @@
     if (maxLocalWorkgroupSize > max_workgroup_size)
         localWorkgroupSize = max_workgroup_size;
-    size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride)
-        / ((numLines * numElementsPerLine + numLines * srcLineStride)
-           + srcPlaneStride);
-    size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride)
-        / ((numLines * numElementsPerLine + numLines * dstLineStride)
-           + dstPlaneStride);
-    size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut);
-    size_t maxLocalWorkgroups =
+    const size_t maxTotalPlanesIn =
+        ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride;
+    const size_t maxTotalPlanesOut =
+        ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride;
+    const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut);
+    const size_t maxLocalWorkgroups =
         maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
-    size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
-        - (localIsDst ? dstPlaneStride : srcPlaneStride);
-    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
-    size_t totalPlanes =
+    const size_t localBufferSize =
+        localWorkgroupSize * localStorageSpacePerWorkitem
+        - (localIsDst ? dstPlaneMargin : srcPlaneMargin);
+    const size_t numberOfLocalWorkgroups =
+        std::min(1111, (int)maxLocalWorkgroups);
+    const size_t totalPlanes =
         numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
-    size_t inBufferSize = elementSize
-        * (totalPlanes
-               * (numLines * numElementsPerLine + numLines * srcLineStride)
-           + (totalPlanes - 1) * srcPlaneStride);
-    size_t outBufferSize = elementSize
-        * (totalPlanes
-               * (numLines * numElementsPerLine + numLines * dstLineStride)
-           + (totalPlanes - 1) * dstPlaneStride);
-    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+    const size_t inBufferSize = elementSize
+        * (totalPlanes * numLines * srcLineStride
+           + (totalPlanes - 1) * srcPlaneMargin);
+    const size_t outBufferSize = elementSize
+        * (totalPlanes * numLines * dstLineStride
+           + (totalPlanes - 1) * dstPlaneMargin);
+    const size_t globalWorkgroupSize =
+        numberOfLocalWorkgroups * localWorkgroupSize;
     inBuffer = (void *)malloc(inBufferSize);
     outBuffer = (void *)malloc(outBufferSize);
     outBufferCopy = (void *)malloc(outBufferSize);
-    cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt,
-        planesCopiesPerWorkgroup;
-    planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem;
-    numElementsPerLineInt = (int)numElementsPerLine;
-    numLinesInt = (int)numLines;
-    planesCopiesPerWorkgroup =
-        (int)(planesCopiesPerWorkItem * localWorkgroupSize);
+    const cl_int planesCopiesPerWorkItemInt =
+        static_cast<cl_int>(planesCopiesPerWorkItem);
+    const cl_int numElementsPerLineInt =
+        static_cast<cl_int>(numElementsPerLine);
+    const cl_int numLinesInt = static_cast<cl_int>(numLines);
+    const cl_int planesCopiesPerWorkgroup =
+        static_cast<cl_int>(planesCopiesPerWorkItem * localWorkgroupSize);
     log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
              "global out buffer %db, each work group will copy %d planes and "
@@ -336,14 +338,8 @@
             for (int k = 0; k < (int)numElementsPerLine * elementSize;
                  k += elementSize)
-                int inIdx = i
-                        * (numLines * numElementsPerLine
-                           + numLines * srcLineStride + srcPlaneStride)
-                    + j * (numElementsPerLine + srcLineStride) + k;
-                int outIdx = i
-                        * (numLines * numElementsPerLine
-                           + numLines * dstLineStride + dstPlaneStride)
-                    + j * (numElementsPerLine + dstLineStride) + k;
+                int inIdx = i * srcPlaneStride + j * srcLineStride + k;
+                int outIdx = i * dstPlaneStride + j * dstLineStride + k;
                 if (memcmp(((char *)inBuffer) + inIdx,
                            ((char *)outBuffer) + outIdx, typeSize)
                     != 0)
@@ -378,14 +374,11 @@
             if (j < (int)numLines * elementSize)
-                int outIdx = i
-                        * (numLines * numElementsPerLine
-                           + numLines * dstLineStride + dstPlaneStride)
-                    + j * (numElementsPerLine + dstLineStride)
+                int outIdx = i * dstPlaneStride + j * dstLineStride
                     + numElementsPerLine * elementSize;
                 if (memcmp(((char *)outBuffer) + outIdx,
                            ((char *)outBufferCopy) + outIdx,
-                           dstLineStride * elementSize)
+                           dstLineMargin * elementSize)
                     != 0)
                     if (failuresPrinted == 0)
@@ -409,14 +402,11 @@
         if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
                 * elementSize)
-            int outIdx = i
-                    * (numLines * numElementsPerLine + numLines * dstLineStride
-                       + dstPlaneStride)
-                + (numLines * elementSize) * (numElementsPerLine)
-                + (numLines * elementSize) * (dstLineStride);
+            int outIdx =
+                i * dstPlaneStride + numLines * dstLineStride * elementSize;
             if (memcmp(((char *)outBuffer) + outIdx,
                        ((char *)outBufferCopy) + outIdx,
-                       dstPlaneStride * elementSize)
+                       dstPlaneMargin * elementSize)
                 != 0)
                 if (failuresPrinted == 0)
@@ -453,10 +443,13 @@
         kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
         kULong, kFloat, kDouble, kNumExplicitTypes
+    // The margins below represent the number of elements between the end of
+    // one line or plane and the start of the next. The strides are equivalent
+    // to the size of the line or plane plus the chosen margin.
     unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
-    unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
-    unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride,
-        dstPlaneStride;
+    unsigned int smallTypesMarginSizes[] = { 0, 10, 100 };
+    unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin,
+        dstPlaneMargin;
     int errors = 0;
@@ -482,33 +475,33 @@
             if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
                 <= 2) // small type
-                for (srcLineStride = 0;
-                     srcLineStride < sizeof(smallTypesStrideSizes)
-                         / sizeof(smallTypesStrideSizes[0]);
-                     srcLineStride++)
+                for (srcLineMargin = 0;
+                     srcLineMargin < sizeof(smallTypesMarginSizes)
+                         / sizeof(smallTypesMarginSizes[0]);
+                     srcLineMargin++)
-                    for (dstLineStride = 0;
-                         dstLineStride < sizeof(smallTypesStrideSizes)
-                             / sizeof(smallTypesStrideSizes[0]);
-                         dstLineStride++)
+                    for (dstLineMargin = 0;
+                         dstLineMargin < sizeof(smallTypesMarginSizes)
+                             / sizeof(smallTypesMarginSizes[0]);
+                         dstLineMargin++)
-                        for (srcPlaneStride = 0;
-                             srcPlaneStride < sizeof(smallTypesStrideSizes)
-                                 / sizeof(smallTypesStrideSizes[0]);
-                             srcPlaneStride++)
+                        for (srcPlaneMargin = 0;
+                             srcPlaneMargin < sizeof(smallTypesMarginSizes)
+                                 / sizeof(smallTypesMarginSizes[0]);
+                             srcPlaneMargin++)
-                            for (dstPlaneStride = 0;
-                                 dstPlaneStride < sizeof(smallTypesStrideSizes)
-                                     / sizeof(smallTypesStrideSizes[0]);
-                                 dstPlaneStride++)
+                            for (dstPlaneMargin = 0;
+                                 dstPlaneMargin < sizeof(smallTypesMarginSizes)
+                                     / sizeof(smallTypesMarginSizes[0]);
+                                 dstPlaneMargin++)
                                 if (test_copy3D(
                                         deviceID, context, queue, kernelCode,
                                         vecType[typeIndex], vecSizes[size],
-                                        smallTypesStrideSizes[srcLineStride],
-                                        smallTypesStrideSizes[dstLineStride],
-                                        smallTypesStrideSizes[srcPlaneStride],
-                                        smallTypesStrideSizes[dstPlaneStride],
+                                        smallTypesMarginSizes[srcLineMargin],
+                                        smallTypesMarginSizes[dstLineMargin],
+                                        smallTypesMarginSizes[srcPlaneMargin],
+                                        smallTypesMarginSizes[dstPlaneMargin],
diff --git a/test_conformance/basic/test_enqueue_map.cpp b/test_conformance/basic/test_enqueue_map.cpp
index 3702726..d28f7e4 100644
--- a/test_conformance/basic/test_enqueue_map.cpp
+++ b/test_conformance/basic/test_enqueue_map.cpp
@@ -146,7 +146,7 @@
     clMemWrapper memObject;
     log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
-    generate_random_data(kUInt, (unsigned int)(imageSize * imageSize), d,
+    generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4), d,
     memcpy(referenceData, hostPtrData, imageDataSize);
diff --git a/test_conformance/basic/test_enqueued_local_size.cpp b/test_conformance/basic/test_enqueued_local_size.cpp
index f52162a..ea95df6 100644
--- a/test_conformance/basic/test_enqueued_local_size.cpp
+++ b/test_conformance/basic/test_enqueued_local_size.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -14,42 +14,45 @@
 // limitations under the License.
 #include "harness/compat.h"
+#include "harness/rounding_mode.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-#include "harness/rounding_mode.h"
+#include <algorithm>
 #include "procs.h"
-static const char *enqueued_local_size_2d_code =
-"__kernel void test_enqueued_local_size_2d(global int *dst)\n"
-"    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))\n"
-"    {\n"
-"        dst[0] = (int)get_enqueued_local_size(0)\n;"
-"        dst[1] = (int)get_enqueued_local_size(1)\n;"
-"    }\n"
+static const char *enqueued_local_size_2d_code = R"(
+__kernel void test_enqueued_local_size_2d(global int *dst)
+    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))
+    {
+        dst[0] = (int)get_enqueued_local_size(0);
+        dst[1] = (int)get_enqueued_local_size(1);
+    }
-static const char *enqueued_local_size_1d_code =
-"__kernel void test_enqueued_local_size_1d(global int *dst)\n"
-"    int  tid_x = get_global_id(0);\n"
-"    if (get_global_id(0) == 0)\n"
-"    {\n"
-"        dst[tid_x] = (int)get_enqueued_local_size(0)\n;"
-"    }\n"
+static const char *enqueued_local_size_1d_code = R"(
+__kernel void test_enqueued_local_size_1d(global int *dst)
+    int  tid_x = get_global_id(0);
+    if (get_global_id(0) == 0)
+    {
+        dst[tid_x] = (int)get_enqueued_local_size(0);
+    }
-static int
-verify_enqueued_local_size(int *result, size_t *expected, int n)
+static int verify_enqueued_local_size(int *result, size_t *expected, int n)
     int i;
-    for (i=0; i<n; i++)
+    for (i = 0; i < n; i++)
         if (result[i] != (int)expected[i])
@@ -62,14 +65,14 @@
-test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueued_local_size(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements)
-    cl_mem streams;
-    cl_program program[2];
-    cl_kernel kernel[2];
+    clMemWrapper stream;
+    clProgramWrapper program[2];
+    clKernelWrapper kernel[2];
-    int *output_ptr;
+    cl_int output_ptr[2];
     size_t globalsize[2];
     size_t localsize[2];
     int err;
@@ -95,37 +98,36 @@
-    output_ptr   = (int*)malloc(2 * sizeof(int));
-    streams =
-        clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * sizeof(int), NULL, &err);
-    test_error( err, "clCreateBuffer failed.");
+    stream = clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * sizeof(cl_int),
+                            nullptr, &err);
+    test_error(err, "clCreateBuffer failed.");
     std::string cl_std = "-cl-std=CL";
     cl_std += (get_device_cl_version(device) == Version(3, 0)) ? "3.0" : "2.0";
     err = create_single_kernel_helper_with_build_options(
         context, &program[0], &kernel[0], 1, &enqueued_local_size_1d_code,
         "test_enqueued_local_size_1d", cl_std.c_str());
-    test_error( err, "create_single_kernel_helper failed");
+    test_error(err, "create_single_kernel_helper failed");
     err = create_single_kernel_helper_with_build_options(
         context, &program[1], &kernel[1], 1, &enqueued_local_size_2d_code,
         "test_enqueued_local_size_2d", cl_std.c_str());
-    test_error( err, "create_single_kernel_helper failed");
+    test_error(err, "create_single_kernel_helper failed");
-    err  = clSetKernelArg(kernel[0], 0, sizeof streams, &streams);
-    test_error( err, "clSetKernelArgs failed.");
-    err  = clSetKernelArg(kernel[1], 0, sizeof streams, &streams);
-    test_error( err, "clSetKernelArgs failed.");
+    err = clSetKernelArg(kernel[0], 0, sizeof stream, &stream);
+    test_error(err, "clSetKernelArgs failed.");
+    err = clSetKernelArg(kernel[1], 0, sizeof stream, &stream);
+    test_error(err, "clSetKernelArgs failed.");
-    globalsize[0] = (size_t)num_elements;
-    globalsize[1] = (size_t)num_elements;
+    globalsize[0] = static_cast<size_t>(num_elements);
+    globalsize[1] = static_cast<size_t>(num_elements);
     size_t max_wgs;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_wgs), &max_wgs, NULL);
-    test_error( err, "clGetDeviceInfo failed.");
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                          sizeof(max_wgs), &max_wgs, nullptr);
+    test_error(err, "clGetDeviceInfo failed.");
-    localsize[0] = MIN(16, max_wgs);
-    localsize[1] = MIN(11, max_wgs / localsize[0]);
+    localsize[0] = std::min<size_t>(16, max_wgs);
+    localsize[1] = std::min<size_t>(11, max_wgs / localsize[0]);
     // If we need to use uniform workgroups because non-uniform workgroups are
     // not supported, round up to the next global size that is divisible by the
     // local size.
@@ -141,35 +143,31 @@
-    err = clEnqueueNDRangeKernel(queue, kernel[1], 2, NULL, globalsize, localsize, 0, NULL, NULL);
-    test_error( err, "clEnqueueNDRangeKernel failed.");
+    err = clEnqueueNDRangeKernel(queue, kernel[1], 2, nullptr, globalsize,
+                                 localsize, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed.");
-    err = clEnqueueReadBuffer(queue, streams, CL_TRUE, 0, 2*sizeof(int), output_ptr, 0, NULL, NULL);
-    test_error( err, "clEnqueueReadBuffer failed.");
+    err = clEnqueueReadBuffer(queue, stream, CL_BLOCKING, 0, 2 * sizeof(int),
+                              output_ptr, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed.");
     err = verify_enqueued_local_size(output_ptr, localsize, 2);
-    globalsize[0] = (size_t)num_elements;
+    globalsize[0] = static_cast<size_t>(num_elements);
     localsize[0] = 9;
     if (use_uniform_work_groups && (globalsize[0] % localsize[0]))
         globalsize[0] += (localsize[0] - (globalsize[0] % localsize[0]));
-    err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, globalsize, localsize, 0, NULL, NULL);
-    test_error( err, "clEnqueueNDRangeKernel failed.");
+    err = clEnqueueNDRangeKernel(queue, kernel[1], 1, nullptr, globalsize,
+                                 localsize, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed.");
-    err = clEnqueueReadBuffer(queue, streams, CL_TRUE, 0, 2*sizeof(int), output_ptr, 0, NULL, NULL);
-    test_error( err, "clEnqueueReadBuffer failed.");
+    err = clEnqueueReadBuffer(queue, stream, CL_BLOCKING, 0, 2 * sizeof(int),
+                              output_ptr, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed.");
     err = verify_enqueued_local_size(output_ptr, localsize, 1);
-    // cleanup
-    clReleaseMemObject(streams);
-    clReleaseKernel(kernel[0]);
-    clReleaseKernel(kernel[1]);
-    clReleaseProgram(program[0]);
-    clReleaseProgram(program[1]);
-    free(output_ptr);
     return err;
diff --git a/test_conformance/basic/test_fpmath_float.cpp b/test_conformance/basic/test_fpmath_float.cpp
index 6e5deb4..60d509b 100644
--- a/test_conformance/basic/test_fpmath_float.cpp
+++ b/test_conformance/basic/test_fpmath_float.cpp
@@ -49,8 +49,6 @@
-static const float    MAX_ERR = 1e-5f;
 static int
 verify_fpadd(float *inptrA, float *inptrB, float *outptr, int n)
diff --git a/test_conformance/basic/test_hiloeo.cpp b/test_conformance/basic/test_hiloeo.cpp
index 4cdf2ac..3470ad0 100644
--- a/test_conformance/basic/test_hiloeo.cpp
+++ b/test_conformance/basic/test_hiloeo.cpp
@@ -43,8 +43,6 @@
 // input type name is strcat(gentype, vector_size_names[i]);
 // and output type name is
 // strcat(gentype, vector_size_names[out_vector_idx[i]]);
-static const int size_to_idx[] = {-1,0,1,2,3,-1,-1,-1,4,
-    -1,-1,-1,-1,-1,-1,-1,5};
 static const char *vector_size_names[] = { "", "2", "3", "4", "8", "16"};
 static const size_t  kSizes[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 };
diff --git a/test_conformance/basic/test_hostptr.cpp b/test_conformance/basic/test_hostptr.cpp
index 65af5c3..dee7867 100644
--- a/test_conformance/basic/test_hostptr.cpp
+++ b/test_conformance/basic/test_hostptr.cpp
@@ -32,8 +32,6 @@
 "    dst[tid] = srcA[tid] + srcB[tid];\n"
-static const float    MAX_ERR = 1e-5f;
 static int verify_hostptr(cl_float *inptrA, cl_float *inptrB, cl_float *outptr, int n)
     cl_float       r;
diff --git a/test_conformance/basic/test_multireadimageonefmt.cpp b/test_conformance/basic/test_multireadimageonefmt.cpp
index b37c841..c230e67 100644
--- a/test_conformance/basic/test_multireadimageonefmt.cpp
+++ b/test_conformance/basic/test_multireadimageonefmt.cpp
@@ -153,14 +153,14 @@
   err  = clSetKernelArg(kernel, 0, sizeof i, &i);
   err |= clSetKernelArg(kernel, 1, sizeof err, &err);
   err |= clSetKernelArg(kernel, 2, sizeof sampler, &sampler);
-  for (i=0; i<8; i++)
-    err |= clSetKernelArg(kernel, 3+i, sizeof streams[i], &streams[i]);
+  for (i = 0; i < 8; i++)
+      err |= clSetKernelArg(kernel, 3 + i, sizeof streams[i], &streams[i]);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
+  if (err != CL_SUCCESS)
+  {
+      log_error("clSetKernelArgs failed\n");
+      return -1;
+  }
     threads[0] = (unsigned int)img_width;
     threads[1] = (unsigned int)img_height;
@@ -182,15 +182,13 @@
     // cleanup
-  for (i=0; i<8; i++)
-    clReleaseMemObject(streams[i]);
+  for (i = 0; i < 8; i++) clReleaseMemObject(streams[i]);
-  for (i=0; i<7; i++)
-    free(input_ptr[i]);
-    free(output_ptr);
+  for (i = 0; i < 7; i++) free(input_ptr[i]);
+  free(output_ptr);
-    return err;
+  return err;
diff --git a/test_conformance/basic/test_preprocessors.cpp b/test_conformance/basic/test_preprocessors.cpp
index 2038d15..e67487e 100644
--- a/test_conformance/basic/test_preprocessors.cpp
+++ b/test_conformance/basic/test_preprocessors.cpp
@@ -97,10 +97,10 @@
     char programSource[4096];
     char curFileName[512];
     char *programPtr = programSource;
-    int i = 0;
     snprintf(curFileName, 512, "%s", __FILE__);
 #ifdef _WIN32
     // Replace "\" with "\\"
+    int i = 0;
     while(curFileName[i] != '\0') {
         if (curFileName[i] == '\\') {
             int j = i + 1;
diff --git a/test_conformance/basic/test_progvar.cpp b/test_conformance/basic/test_progvar.cpp
index 62c0a6b..e202d27 100644
--- a/test_conformance/basic/test_progvar.cpp
+++ b/test_conformance/basic/test_progvar.cpp
@@ -15,12 +15,13 @@
 #include "harness/compat.h"
-// Bug: Missing in spec: atomic_intptr_t is always supported if device is 32-bits.
+// Bug: Missing in spec: atomic_intptr_t is always supported if device is
+// 32-bits.
 #define FLUSH fflush(stdout)
-#define MAX_STR 16*1024
+#define MAX_STR 16 * 1024
 #define ALIGNMENT 128
@@ -66,7 +67,11 @@
 static size_t l_max_global_id0 = 0;
 static cl_bool l_linker_available = false;
-#define check_error(errCode,msg,...) ((errCode != CL_SUCCESS) ? (log_error("ERROR: " msg "! (%s:%d)\n", ## __VA_ARGS__, __FILE__, __LINE__), 1) : 0)
+#define check_error(errCode, msg, ...)                                         \
+    ((errCode != CL_SUCCESS) ? (log_error("ERROR: " msg "! (%s:%d)\n",         \
+                                          ##__VA_ARGS__, __FILE__, __LINE__),  \
+                                1)                                             \
+                             : 0)
 // Info about types we can use for program scope variables.
@@ -75,110 +80,135 @@
 class TypeInfo {
-    TypeInfo() :
-        name(""),
-        m_buf_elem_type(""),
-        m_is_vecbase(false),
-        m_is_atomic(false),
-        m_is_like_size_t(false),
-        m_is_bool(false),
-        m_elem_type(0), m_num_elem(0),
-        m_size(0),
-        m_value_size(0)
-        {}
-    TypeInfo(const char* name_arg) :
-        name(name_arg),
-        m_buf_elem_type(name_arg),
-        m_is_vecbase(false),
-        m_is_atomic(false),
-        m_is_like_size_t(false),
-        m_is_bool(false),
-        m_elem_type(0), m_num_elem(0),
-        m_size(0),
-        m_value_size(0)
-        { }
+    TypeInfo()
+        : name(""), m_buf_elem_type(""), m_is_vecbase(false),
+          m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false),
+          m_elem_type(0), m_num_elem(0), m_size(0), m_value_size(0)
+    {}
+    TypeInfo(const char* name_arg)
+        : name(name_arg), m_buf_elem_type(name_arg), m_is_vecbase(false),
+          m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false),
+          m_elem_type(0), m_num_elem(0), m_size(0), m_value_size(0)
+    {}
     // Vectors
-    TypeInfo( TypeInfo* elem_type, int num_elem ) :
-        m_is_vecbase(false),
-        m_is_atomic(false),
-        m_is_like_size_t(false),
-        m_is_bool(false),
-        m_elem_type(elem_type),
-        m_num_elem(num_elem)
+    TypeInfo(TypeInfo* elem_type, int num_elem)
+        : m_is_vecbase(false), m_is_atomic(false), m_is_like_size_t(false),
+          m_is_bool(false), m_elem_type(elem_type), m_num_elem(num_elem)
-        char the_name[10]; // long enough for longest vector type name "double16"
-        snprintf(the_name,sizeof(the_name),"%s%d",elem_type->get_name_c_str(),m_num_elem);
+        char
+            the_name[10]; // long enough for longest vector type name "double16"
+        snprintf(the_name, sizeof(the_name), "%s%d",
+                 elem_type->get_name_c_str(), m_num_elem);
         this->name = std::string(the_name);
         this->m_buf_elem_type = std::string(the_name);
         this->m_value_size = num_elem * elem_type->get_size();
-        if ( m_num_elem == 3 ) {
+        if (m_num_elem == 3)
+        {
             this->m_size = 4 * elem_type->get_size();
-        } else {
+        }
+        else
+        {
             this->m_size = num_elem * elem_type->get_size();
     const std::string& get_name(void) const { return name; }
     const char* get_name_c_str(void) const { return name.c_str(); }
-    TypeInfo& set_vecbase(void) { this->m_is_vecbase = true; return *this; }
-    TypeInfo& set_atomic(void) { this->m_is_atomic = true; return *this; }
-    TypeInfo& set_like_size_t(void) {
+    TypeInfo& set_vecbase(void)
+    {
+        this->m_is_vecbase = true;
+        return *this;
+    }
+    TypeInfo& set_atomic(void)
+    {
+        this->m_is_atomic = true;
+        return *this;
+    }
+    TypeInfo& set_like_size_t(void)
+    {
         this->m_is_like_size_t = true;
-        this->set_size( l_64bit_device ? 8 : 4 );
+        this->set_size(l_64bit_device ? 8 : 4);
         this->m_buf_elem_type = l_64bit_device ? "ulong" : "uint";
         return *this;
-    TypeInfo& set_bool(void) { this->m_is_bool = true; return *this; }
-    TypeInfo& set_size(size_t n) { this->m_value_size = this->m_size = n; return *this; }
-    TypeInfo& set_buf_elem_type( const char* name ) { this->m_buf_elem_type = std::string(name); return *this; }
+    TypeInfo& set_bool(void)
+    {
+        this->m_is_bool = true;
+        return *this;
+    }
+    TypeInfo& set_size(size_t n)
+    {
+        this->m_value_size = this->m_size = n;
+        return *this;
+    }
+    TypeInfo& set_buf_elem_type(const char* name)
+    {
+        this->m_buf_elem_type = std::string(name);
+        return *this;
+    }
     const TypeInfo* elem_type(void) const { return m_elem_type; }
     int num_elem(void) const { return m_num_elem; }
-    bool is_vecbase(void) const {return m_is_vecbase;}
-    bool is_atomic(void) const {return m_is_atomic;}
-    bool is_atomic_64bit(void) const {return m_is_atomic && m_size == 8;}
-    bool is_like_size_t(void) const {return m_is_like_size_t;}
-    bool is_bool(void) const {return m_is_bool;}
-    size_t get_size(void) const {return m_size;}
-    size_t get_value_size(void) const {return m_value_size;}
+    bool is_vecbase(void) const { return m_is_vecbase; }
+    bool is_atomic(void) const { return m_is_atomic; }
+    bool is_atomic_64bit(void) const { return m_is_atomic && m_size == 8; }
+    bool is_like_size_t(void) const { return m_is_like_size_t; }
+    bool is_bool(void) const { return m_is_bool; }
+    size_t get_size(void) const { return m_size; }
+    size_t get_value_size(void) const { return m_value_size; }
     // When passing values of this type to a kernel, what buffer type
     // should be used?
-    const char* get_buf_elem_type(void) const { return m_buf_elem_type.c_str(); }
+    const char* get_buf_elem_type(void) const
+    {
+        return m_buf_elem_type.c_str();
+    }
-    std::string as_string(const cl_uchar* value_ptr) const {
+    std::string as_string(const cl_uchar* value_ptr) const
+    {
         // This method would be shorter if I had a real handle to element
         // vector type.
-        if ( this->is_bool() ) {
-            std::string result( name );
+        if (this->is_bool())
+        {
+            std::string result(name);
             result += "<";
             result += (*value_ptr ? "true" : "false");
             result += ", ";
             char buf[10];
-            sprintf(buf,"%02x",*value_ptr);
+            sprintf(buf, "%02x", *value_ptr);
             result += buf;
             result += ">";
             return result;
-        } else if ( this->num_elem() ) {
-            std::string result( name );
+        }
+        else if (this->num_elem())
+        {
+            std::string result(name);
             result += "<";
-            for ( unsigned ielem = 0 ; ielem < this->num_elem() ; ielem++ ) {
+            for (unsigned ielem = 0; ielem < this->num_elem(); ielem++)
+            {
                 char buf[MAX_STR];
-                if ( ielem ) result += ", ";
-                for ( unsigned ibyte = 0; ibyte < this->m_elem_type->get_size() ; ibyte++ ) {
-                    sprintf(buf + 2*ibyte,"%02x", value_ptr[ ielem * this->m_elem_type->get_size() + ibyte ] );
+                if (ielem) result += ", ";
+                for (unsigned ibyte = 0; ibyte < this->m_elem_type->get_size();
+                     ibyte++)
+                {
+                    sprintf(buf + 2 * ibyte, "%02x",
+                            value_ptr[ielem * this->m_elem_type->get_size()
+                                      + ibyte]);
                 result += buf;
             result += ">";
             return result;
-        } else {
-            std::string result( name );
+        }
+        else
+        {
+            std::string result(name);
             result += "<";
             char buf[MAX_STR];
-            for ( unsigned ibyte = 0; ibyte < this->get_size() ; ibyte++ ) {
-                sprintf(buf + 2*ibyte,"%02x", value_ptr[ ibyte ] );
+            for (unsigned ibyte = 0; ibyte < this->get_size(); ibyte++)
+            {
+                sprintf(buf + 2 * ibyte, "%02x", value_ptr[ibyte]);
             result += buf;
             result += ">";
@@ -189,51 +219,71 @@
     // Initialize the given buffer to a constant value initialized as if it
     // were from the INIT_VAR macro below.
     // Only needs to support values 0 and 1.
-    void init( cl_uchar* buf, cl_uchar val) const {
-        if ( this->num_elem() ) {
-            for ( unsigned ielem = 0 ; ielem < this->num_elem() ; ielem++ ) {
+    void init(cl_uchar* buf, cl_uchar val) const
+    {
+        if (this->num_elem())
+        {
+            for (unsigned ielem = 0; ielem < this->num_elem(); ielem++)
+            {
                 // Delegate!
-                this->init_elem( buf + ielem * this->get_value_size()/this->num_elem(), val );
+                this->init_elem(
+                    buf + ielem * this->get_value_size() / this->num_elem(),
+                    val);
-        } else {
-            init_elem( buf, val );
+        }
+        else
+        {
+            init_elem(buf, val);
-    void init_elem( cl_uchar* buf, cl_uchar val ) const {
-        size_t elem_size = this->num_elem() ? this->get_value_size()/this->num_elem() : this->get_size();
-        memset(buf,0,elem_size);
-        if ( val ) {
-            if ( strstr( name.c_str(), "float" ) ) {
+    void init_elem(cl_uchar* buf, cl_uchar val) const
+    {
+        size_t elem_size = this->num_elem()
+            ? this->get_value_size() / this->num_elem()
+            : this->get_size();
+        memset(buf, 0, elem_size);
+        if (val)
+        {
+            if (strstr(name.c_str(), "float"))
+            {
                 *(float*)buf = (float)val;
-            if ( strstr( name.c_str(), "double" ) ) {
+            if (strstr(name.c_str(), "double"))
+            {
                 *(double*)buf = (double)val;
-            if ( this->is_bool() ) { *buf = (bool)val; return; }
+            if (this->is_bool())
+            {
+                *buf = (bool)val;
+                return;
+            }
             // Write a single character value to the correct spot,
             // depending on host endianness.
-            if ( l_host_is_big_endian ) *(buf + elem_size-1) = (cl_uchar)val;
-            else *buf = (cl_uchar)val;
+            if (l_host_is_big_endian)
+                *(buf + elem_size - 1) = (cl_uchar)val;
+            else
+                *buf = (cl_uchar)val;
-    void dump(FILE* fp) const {
-        fprintf(fp,"Type %s : <%d,%d,%s> ", name.c_str(),
-                (int)m_size,
-                (int)m_value_size,
-                m_buf_elem_type.c_str() );
-        if ( this->m_elem_type ) fprintf(fp, " vec(%s,%d)", this->m_elem_type->get_name_c_str(), this->num_elem() );
-        if ( this->m_is_vecbase ) fprintf(fp, " vecbase");
-        if ( this->m_is_bool ) fprintf(fp, " bool");
-        if ( this->m_is_like_size_t ) fprintf(fp, " like-size_t");
-        if ( this->m_is_atomic ) fprintf(fp, " atomic");
-        fprintf(fp,"\n");
+    void dump(FILE* fp) const
+    {
+        fprintf(fp, "Type %s : <%d,%d,%s> ", name.c_str(), (int)m_size,
+                (int)m_value_size, m_buf_elem_type.c_str());
+        if (this->m_elem_type)
+            fprintf(fp, " vec(%s,%d)", this->m_elem_type->get_name_c_str(),
+                    this->num_elem());
+        if (this->m_is_vecbase) fprintf(fp, " vecbase");
+        if (this->m_is_bool) fprintf(fp, " bool");
+        if (this->m_is_like_size_t) fprintf(fp, " like-size_t");
+        if (this->m_is_atomic) fprintf(fp, " atomic");
+        fprintf(fp, "\n");
@@ -246,7 +296,8 @@
     bool m_is_like_size_t;
     bool m_is_bool;
     size_t m_size; // Number of bytes of storage occupied by this type.
-    size_t m_value_size; // Number of bytes of value significant for this type. Differs for vec3.
+    size_t m_value_size; // Number of bytes of value significant for this type.
+                         // Differs for vec3.
     // When passing values of this type to a kernel, what buffer type
     // should be used?
@@ -256,46 +307,65 @@
-#define NUM_SCALAR_TYPES (8+2) // signed and unsigned integral types, float and double
-#define NUM_VECTOR_SIZES (5)   // 2,3,4,8,16
-#define NUM_PLAIN_TYPES \
-      5 /*boolean and size_t family */  \
-    + 10 /* atomic types */
+#define NUM_SCALAR_TYPES                                                       \
+    (8 + 2) // signed and unsigned integral types, float and double
+#define NUM_VECTOR_SIZES (5) // 2,3,4,8,16
+#define NUM_PLAIN_TYPES                                                        \
+    5 /*boolean and size_t family */                                           \
+        + 10 /* atomic types */
 // Need room for plain, array, pointer, struct
 static TypeInfo type_info[MAX_TYPES];
 static int num_type_info = 0; // Number of valid entries in type_info[]
 // A helper class to form kernel source arguments for clCreateProgramWithSource.
 class StringTable {
-    StringTable() : m_c_strs(NULL), m_lengths(NULL), m_frozen(false), m_strings() {}
+    StringTable(): m_c_strs(NULL), m_lengths(NULL), m_frozen(false), m_strings()
+    {}
     ~StringTable() { release_frozen(); }
-    void add(std::string s) { release_frozen(); m_strings.push_back(s); }
+    void add(std::string s)
+    {
+        release_frozen();
+        m_strings.push_back(s);
+    }
-    const size_t num_str() { freeze(); return m_strings.size(); }
-    const char** strs() { freeze(); return m_c_strs; }
-    const size_t* lengths() { freeze(); return m_lengths; }
+    const size_t num_str()
+    {
+        freeze();
+        return m_strings.size();
+    }
+    const char** strs()
+    {
+        freeze();
+        return m_c_strs;
+    }
+    const size_t* lengths()
+    {
+        freeze();
+        return m_lengths;
+    }
-    void freeze(void) {
-        if ( !m_frozen ) {
+    void freeze(void)
+    {
+        if (!m_frozen)
+        {
-            m_c_strs = (const char**) malloc(sizeof(const char*) * m_strings.size());
-            m_lengths = (size_t*) malloc(sizeof(size_t) * m_strings.size());
-            assert( m_c_strs );
-            assert( m_lengths );
+            m_c_strs =
+                (const char**)malloc(sizeof(const char*) * m_strings.size());
+            m_lengths = (size_t*)malloc(sizeof(size_t) * m_strings.size());
+            assert(m_c_strs);
+            assert(m_lengths);
-            for ( size_t i = 0; i < m_strings.size() ; i++ ) {
+            for (size_t i = 0; i < m_strings.size(); i++)
+            {
                 m_c_strs[i] = m_strings[i].c_str();
                 m_lengths[i] = strlen(m_c_strs[i]);
@@ -303,9 +373,18 @@
             m_frozen = true;
-    void release_frozen(void) {
-        if ( m_c_strs ) { free(m_c_strs); m_c_strs = 0; }
-        if ( m_lengths ) { free(m_lengths); m_lengths = 0; }
+    void release_frozen(void)
+    {
+        if (m_c_strs)
+        {
+            free(m_c_strs);
+            m_c_strs = 0;
+        }
+        if (m_lengths)
+        {
+            free(m_lengths);
+            m_lengths = 0;
+        }
         m_frozen = false;
@@ -325,11 +404,15 @@
 static const char* l_get_cles_int64_pragma(void);
 static int l_build_type_table(cl_device_id device);
-static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* pref_size_ret);
+static int l_get_device_info(cl_device_id device, size_t* max_size_ret,
+                             size_t* pref_size_ret);
-static void l_set_randomly( cl_uchar* buf, size_t buf_size, RandomSeed& rand_state );
-static int l_compare( const cl_uchar* expected, const cl_uchar* received, unsigned num_values, const TypeInfo&ti );
-static int l_copy( cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, unsigned src_idx, const TypeInfo&ti );
+static void l_set_randomly(cl_uchar* buf, size_t buf_size,
+                           RandomSeed& rand_state);
+static int l_compare(const cl_uchar* expected, const cl_uchar* received,
+                     unsigned num_values, const TypeInfo& ti);
+static int l_copy(cl_uchar* dest, unsigned dest_idx, const cl_uchar* src,
+                  unsigned src_idx, const TypeInfo& ti);
 static std::string conversion_functions(const TypeInfo& ti);
 static std::string global_decls(const TypeInfo& ti, bool with_init);
@@ -337,90 +420,123 @@
 static std::string writer_function(const TypeInfo& ti);
 static std::string reader_function(const TypeInfo& ti);
-static int l_write_read( cl_device_id device, cl_context context, cl_command_queue queue );
-static int l_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state );
+static int l_write_read(cl_device_id device, cl_context context,
+                        cl_command_queue queue);
+static int l_write_read_for_type(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, const TypeInfo& ti,
+                                 RandomSeed& rand_state);
-static int l_init_write_read( cl_device_id device, cl_context context, cl_command_queue queue );
-static int l_init_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state );
+static int l_init_write_read(cl_device_id device, cl_context context,
+                             cl_command_queue queue);
+static int l_init_write_read_for_type(cl_device_id device, cl_context context,
+                                      cl_command_queue queue,
+                                      const TypeInfo& ti,
+                                      RandomSeed& rand_state);
-static int l_capacity( cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size );
-static int l_user_type( cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size, bool separate_compilation );
+static int l_capacity(cl_device_id device, cl_context context,
+                      cl_command_queue queue, size_t max_size);
+static int l_user_type(cl_device_id device, cl_context context,
+                       cl_command_queue queue, size_t max_size,
+                       bool separate_compilation);
 // File scope function definitions
-static cl_int print_build_log(cl_program program, cl_uint num_devices, cl_device_id *device_list, cl_uint count, const char **strings, const size_t *lengths, const char* options)
+static cl_int print_build_log(cl_program program, cl_uint num_devices,
+                              cl_device_id* device_list, cl_uint count,
+                              const char** strings, const size_t* lengths,
+                              const char* options)
     cl_uint i;
     cl_int error;
     BufferOwningPtr<cl_device_id> devices;
-    if(num_devices == 0 || device_list == NULL)
+    if (num_devices == 0 || device_list == NULL)
-        error = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(num_devices), &num_devices, NULL);
+        error = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
+                                 sizeof(num_devices), &num_devices, NULL);
         test_error(error, "clGetProgramInfo CL_PROGRAM_NUM_DEVICES failed");
-        device_list = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
+        device_list = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices);
         memset(device_list, 0, sizeof(cl_device_id) * num_devices);
-        error = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * num_devices, device_list, NULL);
+        error = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
+                                 sizeof(cl_device_id) * num_devices,
+                                 device_list, NULL);
         test_error(error, "clGetProgramInfo CL_PROGRAM_DEVICES failed");
     cl_uint z;
     bool sourcePrinted = false;
-    for(z = 0; z < num_devices; z++)
+    for (z = 0; z < num_devices; z++)
         char deviceName[4096] = "";
-        error = clGetDeviceInfo(device_list[z], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
-        check_error(error, "Device \"%d\" failed to return a name. clGetDeviceInfo CL_DEVICE_NAME failed", z);
+        error = clGetDeviceInfo(device_list[z], CL_DEVICE_NAME,
+                                sizeof(deviceName), deviceName, NULL);
+        check_error(error,
+                    "Device \"%d\" failed to return a name. clGetDeviceInfo "
+                    "CL_DEVICE_NAME failed",
+                    z);
         cl_build_status buildStatus;
-        error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_STATUS, sizeof(buildStatus), &buildStatus, NULL);
-        check_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_STATUS failed");
+        error = clGetProgramBuildInfo(program, device_list[z],
+                                      CL_PROGRAM_BUILD_STATUS,
+                                      sizeof(buildStatus), &buildStatus, NULL);
+        check_error(error,
+                    "clGetProgramBuildInfo CL_PROGRAM_BUILD_STATUS failed");
-        if(buildStatus != CL_BUILD_SUCCESS)
+        if (buildStatus != CL_BUILD_SUCCESS)
-            if(!sourcePrinted)
+            if (!sourcePrinted)
                 log_error("Build options: %s\n", options);
-                if(count && strings)
+                if (count && strings)
                     log_error("Original source is: ------------\n");
-                    for(i = 0; i < count; i++) log_error("%s", strings[i]);
+                    for (i = 0; i < count; i++) log_error("%s", strings[i]);
                 sourcePrinted = true;
             char statusString[64] = "";
             if (buildStatus == (cl_build_status)CL_BUILD_SUCCESS)
-              sprintf(statusString, "CL_BUILD_SUCCESS");
+                sprintf(statusString, "CL_BUILD_SUCCESS");
             else if (buildStatus == (cl_build_status)CL_BUILD_NONE)
-              sprintf(statusString, "CL_BUILD_NONE");
+                sprintf(statusString, "CL_BUILD_NONE");
             else if (buildStatus == (cl_build_status)CL_BUILD_ERROR)
-              sprintf(statusString, "CL_BUILD_ERROR");
+                sprintf(statusString, "CL_BUILD_ERROR");
             else if (buildStatus == (cl_build_status)CL_BUILD_IN_PROGRESS)
-              sprintf(statusString, "CL_BUILD_IN_PROGRESS");
+                sprintf(statusString, "CL_BUILD_IN_PROGRESS");
-              sprintf(statusString, "UNKNOWN (%d)", buildStatus);
+                sprintf(statusString, "UNKNOWN (%d)", buildStatus);
-            log_error("Build not successful for device \"%s\", status: %s\n", deviceName, statusString);
+            log_error("Build not successful for device \"%s\", status: %s\n",
+                      deviceName, statusString);
             size_t paramSize = 0;
-            error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_LOG, 0, NULL, &paramSize);
-            if(check_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed")) break;
+            error = clGetProgramBuildInfo(program, device_list[z],
+                                          CL_PROGRAM_BUILD_LOG, 0, NULL,
+                                          &paramSize);
+            if (check_error(
+                    error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed"))
+                break;
             std::string log;
-            log.resize(paramSize/sizeof(char));
+            log.resize(paramSize / sizeof(char));
-            error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_LOG, paramSize, &log[0], NULL);
-            if(check_error(error, "Device %d (%s) failed to return a build log", z, deviceName)) break;
-            if(log[0] == 0) log_error("clGetProgramBuildInfo returned an empty log.\n");
+            error = clGetProgramBuildInfo(program, device_list[z],
+                                          CL_PROGRAM_BUILD_LOG, paramSize,
+                                          &log[0], NULL);
+            if (check_error(error,
+                            "Device %d (%s) failed to return a build log", z,
+                            deviceName))
+                break;
+            if (log[0] == 0)
+                log_error("clGetProgramBuildInfo returned an empty log.\n");
                 log_error("Build log:\n", deviceName);
@@ -433,25 +549,29 @@
 static void l_load_abilities(cl_device_id device)
-    l_has_half       = is_extension_available(device,"cl_khr_fp16");
-    l_has_double     = is_extension_available(device,"cl_khr_fp64");
-    l_has_cles_int64 = is_extension_available(device,"cles_khr_int64");
+    l_has_half = is_extension_available(device, "cl_khr_fp16");
+    l_has_double = is_extension_available(device, "cl_khr_fp64");
+    l_has_cles_int64 = is_extension_available(device, "cles_khr_int64");
-    l_has_int64_atomics
-    =  is_extension_available(device,"cl_khr_int64_base_atomics")
-    && is_extension_available(device,"cl_khr_int64_extended_atomics");
+    l_has_int64_atomics =
+        is_extension_available(device, "cl_khr_int64_base_atomics")
+        && is_extension_available(device, "cl_khr_int64_extended_atomics");
         int status = CL_SUCCESS;
         cl_uint addr_bits = 32;
-        status = clGetDeviceInfo(device,CL_DEVICE_ADDRESS_BITS,sizeof(addr_bits),&addr_bits,0);
-        l_64bit_device = ( status == CL_SUCCESS && addr_bits == 64 );
+        status = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS,
+                                 sizeof(addr_bits), &addr_bits, 0);
+        l_64bit_device = (status == CL_SUCCESS && addr_bits == 64);
     // 32-bit devices always have intptr atomics.
     l_has_intptr_atomics = !l_64bit_device || l_has_int64_atomics;
-    union { char c[4]; int i; } probe;
+    union {
+        char c[4];
+        int i;
+    } probe;
     probe.i = 1;
     l_host_is_big_endian = !probe.c[0];
@@ -459,33 +579,40 @@
         int status = CL_SUCCESS;
         cl_uint max_dim = 0;
-        status = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(max_dim),&max_dim,0);
-        assert( status == CL_SUCCESS );
-        assert( max_dim > 0 );
+        status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                                 sizeof(max_dim), &max_dim, 0);
+        assert(status == CL_SUCCESS);
+        assert(max_dim > 0);
         size_t max_id[3];
         max_id[0] = 0;
-    status = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES,max_dim*sizeof(size_t),&max_id[0],0);
-        assert( status == CL_SUCCESS );
+        status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                                 max_dim * sizeof(size_t), &max_id[0], 0);
+        assert(status == CL_SUCCESS);
         l_max_global_id0 = max_id[0];
     { // Is separate compilation supported?
         int status = CL_SUCCESS;
         l_linker_available = false;
-        status = clGetDeviceInfo(device,CL_DEVICE_LINKER_AVAILABLE,sizeof(l_linker_available),&l_linker_available,0);
-        assert( status == CL_SUCCESS );
+        status =
+            clGetDeviceInfo(device, CL_DEVICE_LINKER_AVAILABLE,
+                            sizeof(l_linker_available), &l_linker_available, 0);
+        assert(status == CL_SUCCESS);
 static const char* l_get_fp64_pragma(void)
-    return l_has_double ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" : "";
+    return l_has_double ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+                        : "";
 static const char* l_get_cles_int64_pragma(void)
-    return l_has_cles_int64 ? "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n" : "";
+    return l_has_cles_int64
+        ? "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n"
+        : "";
 static const char* l_get_int64_atomic_pragma(void)
@@ -500,89 +627,83 @@
     size_t iscalar = 0;
     size_t ivecsize = 0;
     int vecsizes[] = { 2, 3, 4, 8, 16 };
-    const char* vecbase[] = {
-        "uchar", "char",
-        "ushort", "short",
-        "uint", "int",
-        "ulong", "long",
-        "float",
-        "double"
-    };
-    int vecbase_size[] = {
-        1, 1,
-        2, 2,
-        4, 4,
-        8, 8,
-        4,
-        8
-    };
-    const char* like_size_t[] = {
-        "intptr_t",
-        "uintptr_t",
-        "size_t",
-        "ptrdiff_t"
-    };
+    const char* vecbase[] = { "uchar", "char",  "ushort", "short", "uint",
+                              "int",   "ulong", "long",   "float", "double" };
+    int vecbase_size[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 };
+    const char* like_size_t[] = { "intptr_t", "uintptr_t", "size_t",
+                                  "ptrdiff_t" };
     const char* atomics[] = {
-        "atomic_int", "atomic_uint",
-        "atomic_long", "atomic_ulong",
-        "atomic_float",
-        "atomic_double",
+        "atomic_int",   "atomic_uint",  "atomic_long",
+        "atomic_ulong", "atomic_float", "atomic_double",
-    int atomics_size[] = {
-        4, 4,
-        8, 8,
-        4,
-        8
-    };
-    const char* intptr_atomics[] = {
-        "atomic_intptr_t",
-        "atomic_uintptr_t",
-        "atomic_size_t",
-        "atomic_ptrdiff_t"
-    };
+    int atomics_size[] = { 4, 4, 8, 8, 4, 8 };
+    const char* intptr_atomics[] = { "atomic_intptr_t", "atomic_uintptr_t",
+                                     "atomic_size_t", "atomic_ptrdiff_t" };
     num_type_info = 0;
     // Boolean.
-    type_info[ num_type_info++ ] = TypeInfo( "bool" ).set_bool().set_size(1).set_buf_elem_type("uchar");
+    type_info[num_type_info++] =
+        TypeInfo("bool").set_bool().set_size(1).set_buf_elem_type("uchar");
     // Vector types, and the related scalar element types.
-    for ( iscalar=0; iscalar < sizeof(vecbase)/sizeof(vecbase[0]) ; ++iscalar ) {
-        if ( !gHasLong && strstr(vecbase[iscalar],"long") ) continue;
-        if ( !l_has_double && strstr(vecbase[iscalar],"double") ) continue;
+    for (iscalar = 0; iscalar < sizeof(vecbase) / sizeof(vecbase[0]); ++iscalar)
+    {
+        if (!gHasLong && strstr(vecbase[iscalar], "long")) continue;
+        if (!l_has_double && strstr(vecbase[iscalar], "double")) continue;
         // Scalar
         TypeInfo* elem_type = type_info + num_type_info++;
-        *elem_type = TypeInfo( vecbase[iscalar] ).set_vecbase().set_size( vecbase_size[iscalar] );
+        *elem_type = TypeInfo(vecbase[iscalar])
+                         .set_vecbase()
+                         .set_size(vecbase_size[iscalar]);
         // Vector
-        for ( ivecsize=0; ivecsize < sizeof(vecsizes)/sizeof(vecsizes[0]) ; ivecsize++ ) {
-            type_info[ num_type_info++ ] = TypeInfo( elem_type, vecsizes[ivecsize] );
+        for (ivecsize = 0; ivecsize < sizeof(vecsizes) / sizeof(vecsizes[0]);
+             ivecsize++)
+        {
+            type_info[num_type_info++] =
+                TypeInfo(elem_type, vecsizes[ivecsize]);
     // Size_t-like types
-    for ( iscalar=0; iscalar < sizeof(like_size_t)/sizeof(like_size_t[0]) ; ++iscalar ) {
-        type_info[ num_type_info++ ] = TypeInfo( like_size_t[iscalar] ).set_like_size_t();
+    for (iscalar = 0; iscalar < sizeof(like_size_t) / sizeof(like_size_t[0]);
+         ++iscalar)
+    {
+        type_info[num_type_info++] =
+            TypeInfo(like_size_t[iscalar]).set_like_size_t();
     // Atomic types.
-    for ( iscalar=0; iscalar < sizeof(atomics)/sizeof(atomics[0]) ; ++iscalar ) {
-        if ( !l_has_int64_atomics && strstr(atomics[iscalar],"long") ) continue;
-        if ( !(l_has_int64_atomics && l_has_double) && strstr(atomics[iscalar],"double") ) continue;
+    for (iscalar = 0; iscalar < sizeof(atomics) / sizeof(atomics[0]); ++iscalar)
+    {
+        if (!l_has_int64_atomics && strstr(atomics[iscalar], "long")) continue;
+        if (!(l_has_int64_atomics && l_has_double)
+            && strstr(atomics[iscalar], "double"))
+            continue;
         // The +7 is used to skip over the "atomic_" prefix.
         const char* buf_type = atomics[iscalar] + 7;
-        type_info[ num_type_info++ ] = TypeInfo( atomics[iscalar] ).set_atomic().set_size( atomics_size[iscalar] ).set_buf_elem_type( buf_type );
+        type_info[num_type_info++] = TypeInfo(atomics[iscalar])
+                                         .set_atomic()
+                                         .set_size(atomics_size[iscalar])
+                                         .set_buf_elem_type(buf_type);
-    if ( l_has_intptr_atomics ) {
-        for ( iscalar=0; iscalar < sizeof(intptr_atomics)/sizeof(intptr_atomics[0]) ; ++iscalar ) {
-            type_info[ num_type_info++ ] = TypeInfo( intptr_atomics[iscalar] ).set_atomic().set_like_size_t();
+    if (l_has_intptr_atomics)
+    {
+        for (iscalar = 0;
+             iscalar < sizeof(intptr_atomics) / sizeof(intptr_atomics[0]);
+             ++iscalar)
+        {
+            type_info[num_type_info++] = TypeInfo(intptr_atomics[iscalar])
+                                             .set_atomic()
+                                             .set_like_size_t();
-    assert( num_type_info <= MAX_TYPES ); // or increase MAX_TYPES
+    assert(num_type_info <= MAX_TYPES); // or increase MAX_TYPES
 #if 0
     for ( size_t i = 0 ; i < num_type_info ; i++ ) {
@@ -594,7 +715,7 @@
     return status;
-static const TypeInfo& l_find_type( const char* name )
+static const TypeInfo& l_find_type(const char* name)
     auto itr =
         std::find_if(type_info, type_info + num_type_info,
@@ -604,36 +725,54 @@
+// Populate return parameters for max program variable size, preferred program
+// variable size.
-// Populate return parameters for max program variable size, preferred program variable size.
-static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* pref_size_ret)
+static int l_get_device_info(cl_device_id device, size_t* max_size_ret,
+                             size_t* pref_size_ret)
     int err = CL_SUCCESS;
     size_t return_size = 0;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, sizeof(*max_size_ret), max_size_ret, &return_size);
-    if ( err != CL_SUCCESS ) {
-        log_error("Error: Failed to get device info for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n");
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE,
+                          sizeof(*max_size_ret), max_size_ret, &return_size);
+    if (err != CL_SUCCESS)
+    {
+        log_error("Error: Failed to get device info for "
+                  "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n");
         return err;
-    if ( return_size != sizeof(size_t) ) {
-        log_error("Error: Invalid size %d returned for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", (int)return_size );
+    if (return_size != sizeof(size_t))
+    {
+        log_error("Error: Invalid size %d returned for "
+                  "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n",
+                  (int)return_size);
         return 1;
-    if ( return_size != sizeof(size_t) ) {
-        log_error("Error: Invalid size %d returned for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", (int)return_size );
+    if (return_size != sizeof(size_t))
+    {
+        log_error("Error: Invalid size %d returned for "
+                  "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n",
+                  (int)return_size);
         return 1;
     return_size = 0;
-    err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, sizeof(*pref_size_ret), pref_size_ret, &return_size);
-    if ( err != CL_SUCCESS ) {
-        log_error("Error: Failed to get device info for CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE: %d\n",err);
+    err =
+                        sizeof(*pref_size_ret), pref_size_ret, &return_size);
+    if (err != CL_SUCCESS)
+    {
+        log_error("Error: Failed to get device info for "
+                  err);
         return err;
-    if ( return_size != sizeof(size_t) ) {
-        log_error("Error: Invalid size %d returned for CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE\n", (int)return_size );
+    if (return_size != sizeof(size_t))
+    {
+        log_error("Error: Invalid size %d returned for "
+                  (int)return_size);
         return 1;
@@ -641,11 +780,13 @@
-static void l_set_randomly( cl_uchar* buf, size_t buf_size, RandomSeed& rand_state )
+static void l_set_randomly(cl_uchar* buf, size_t buf_size,
+                           RandomSeed& rand_state)
-    assert( 0 == (buf_size % sizeof(cl_uint) ) );
-    for ( size_t i = 0; i < buf_size ; i += sizeof(cl_uint) ) {
-        *( (cl_uint*)(buf + i) ) = genrand_int32( rand_state );
+    assert(0 == (buf_size % sizeof(cl_uint)));
+    for (size_t i = 0; i < buf_size; i += sizeof(cl_uint))
+    {
+        *((cl_uint*)(buf + i)) = genrand_int32(rand_state);
 #if 0
     for ( size_t i = 0; i < buf_size ; i++ ) {
@@ -657,20 +798,23 @@
 // Return num_value values of the given type.
 // Returns CL_SUCCESS if they compared as equal.
-static int l_compare( const char* test_name, const cl_uchar* expected, const cl_uchar* received, size_t num_values, const TypeInfo&ti )
+static int l_compare(const char* test_name, const cl_uchar* expected,
+                     const cl_uchar* received, size_t num_values,
+                     const TypeInfo& ti)
     // Compare only the valid returned bytes.
-    for ( unsigned value_idx = 0; value_idx < num_values; value_idx++ ) {
+    for (unsigned value_idx = 0; value_idx < num_values; value_idx++)
+    {
         const cl_uchar* expv = expected + value_idx * ti.get_size();
         const cl_uchar* gotv = received + value_idx * ti.get_size();
-        if ( memcmp( expv, gotv, ti.get_value_size() ) ) {
-            std::string exp_str = ti.as_string( expv );
-            std::string got_str = ti.as_string( gotv );
-            log_error("Error: %s test for type %s, at index %d: Expected %s got %s\n",
-                    test_name,
-                    ti.get_name_c_str(), value_idx,
-                    exp_str.c_str(),
-                    got_str.c_str() );
+        if (memcmp(expv, gotv, ti.get_value_size()))
+        {
+            std::string exp_str = ti.as_string(expv);
+            std::string got_str = ti.as_string(gotv);
+            log_error(
+                "Error: %s test for type %s, at index %d: Expected %s got %s\n",
+                test_name, ti.get_name_c_str(), value_idx, exp_str.c_str(),
+                got_str.c_str());
             return 1;
@@ -678,11 +822,12 @@
 // Copy a target value from src[idx] to dest[idx]
-static int l_copy( cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, unsigned src_idx, const TypeInfo&ti )
+static int l_copy(cl_uchar* dest, unsigned dest_idx, const cl_uchar* src,
+                  unsigned src_idx, const TypeInfo& ti)
-    cl_uchar* raw_dest      = dest + dest_idx * ti.get_size();
-    const cl_uchar* raw_src =  src +  src_idx * ti.get_size();
-    memcpy( raw_dest, raw_src,  ti.get_value_size() );
+    cl_uchar* raw_dest = dest + dest_idx * ti.get_size();
+    const cl_uchar* raw_src = src + src_idx * ti.get_size();
+    memcpy(raw_dest, raw_src, ti.get_value_size());
     return 0;
@@ -694,59 +839,70 @@
     static char buf[MAX_STR];
     int num_printed = 0;
     // The atomic types just use the base type.
-    if ( ti.is_atomic() || 0 == strcmp( ti.get_buf_elem_type(), ti.get_name_c_str() ) ) {
+    if (ti.is_atomic()
+        || 0 == strcmp(ti.get_buf_elem_type(), ti.get_name_c_str()))
+    {
         // The type is represented in a buffer by itself.
-        num_printed = snprintf(buf,MAX_STR,
-                "%s from_buf(%s a) { return a; }\n"
-                "%s to_buf(%s a) { return a; }\n",
-                ti.get_buf_elem_type(), ti.get_buf_elem_type(),
-                ti.get_buf_elem_type(), ti.get_buf_elem_type() );
-    } else {
+        num_printed = snprintf(buf, MAX_STR,
+                               "%s from_buf(%s a) { return a; }\n"
+                               "%s to_buf(%s a) { return a; }\n",
+                               ti.get_buf_elem_type(), ti.get_buf_elem_type(),
+                               ti.get_buf_elem_type(), ti.get_buf_elem_type());
+    }
+    else
+    {
         // Just use C-style cast.
-        num_printed = snprintf(buf,MAX_STR,
-                "%s from_buf(%s a) { return (%s)a; }\n"
-                "%s to_buf(%s a) { return (%s)a; }\n",
-                ti.get_name_c_str(), ti.get_buf_elem_type(), ti.get_name_c_str(),
-                ti.get_buf_elem_type(), ti.get_name_c_str(), ti.get_buf_elem_type() );
+        num_printed = snprintf(buf, MAX_STR,
+                               "%s from_buf(%s a) { return (%s)a; }\n"
+                               "%s to_buf(%s a) { return (%s)a; }\n",
+                               ti.get_name_c_str(), ti.get_buf_elem_type(),
+                               ti.get_name_c_str(), ti.get_buf_elem_type(),
+                               ti.get_name_c_str(), ti.get_buf_elem_type());
     // Add initializations.
-    if ( ti.is_atomic() ) {
-        num_printed += snprintf( buf + num_printed, MAX_STR-num_printed,
-                "#define INIT_VAR(a) ATOMIC_VAR_INIT(a)\n" );
-    } else {
-        // This cast works even if the target type is a vector type.
-        num_printed += snprintf( buf + num_printed, MAX_STR-num_printed,
-                "#define INIT_VAR(a) ((%s)(a))\n", ti.get_name_c_str());
+    if (ti.is_atomic())
+    {
+        num_printed += snprintf(buf + num_printed, MAX_STR - num_printed,
+                                "#define INIT_VAR(a) ATOMIC_VAR_INIT(a)\n");
-    assert( num_printed < MAX_STR ); // or increase MAX_STR
+    else
+    {
+        // This cast works even if the target type is a vector type.
+        num_printed +=
+            snprintf(buf + num_printed, MAX_STR - num_printed,
+                     "#define INIT_VAR(a) ((%s)(a))\n", ti.get_name_c_str());
+    }
+    assert(num_printed < MAX_STR); // or increase MAX_STR
     result = buf;
     return result;
-static std::string global_decls(const TypeInfo& ti, bool with_init )
+static std::string global_decls(const TypeInfo& ti, bool with_init)
     const char* tn = ti.get_name_c_str();
     const char* vol = (ti.is_atomic() ? " volatile " : " ");
     static char decls[MAX_STR];
     int num_printed = 0;
-    if ( with_init ) {
-        const char *decls_template_with_init =
+    if (with_init)
+    {
+        const char* decls_template_with_init =
             "%s %s var = INIT_VAR(0);\n"
             "global %s %s g_var = INIT_VAR(1);\n"
             "%s %s a_var[2] = { INIT_VAR(1), INIT_VAR(1) };\n"
             "volatile global %s %s* p_var = &a_var[1];\n\n";
-        num_printed = snprintf(decls,sizeof(decls),decls_template_with_init,
-                vol,tn,vol,tn,vol,tn,vol,tn);
-    } else {
-        const char *decls_template_no_init =
-            "%s %s var;\n"
-            "global %s %s g_var;\n"
-            "%s %s a_var[2];\n"
-            "global %s %s* p_var;\n\n";
-        num_printed = snprintf(decls,sizeof(decls),decls_template_no_init,
-             vol,tn,vol,tn,vol,tn,vol,tn);
+        num_printed = snprintf(decls, sizeof(decls), decls_template_with_init,
+                               vol, tn, vol, tn, vol, tn, vol, tn);
-    assert( num_printed < sizeof(decls) );
+    else
+    {
+        const char* decls_template_no_init = "%s %s var;\n"
+                                             "global %s %s g_var;\n"
+                                             "%s %s a_var[2];\n"
+                                             "global %s %s* p_var;\n\n";
+        num_printed = snprintf(decls, sizeof(decls), decls_template_no_init,
+                               vol, tn, vol, tn, vol, tn, vol, tn);
+    }
+    assert(num_printed < sizeof(decls));
     return std::string(decls);
@@ -761,18 +917,26 @@
     // all() should only be used on vector inputs. For scalar comparison, the
     // result of the equality operator can be used as a bool value.
-    const bool is_scalar = ti.num_elem() == 0; // 0 is used to represent scalar types, not 1.
+    const bool is_scalar =
+        ti.num_elem() == 0; // 0 is used to represent scalar types, not 1.
     const std::string is_equality_true = is_scalar ? "" : "all";
     std::string code = "kernel void global_check(global int* out) {\n";
     code += "  const " + type_name + " zero = ((" + type_name + ")0);\n";
     code += "  bool status = true;\n";
-    if (ti.is_atomic()) {
-        code += "  status &= " + is_equality_true + "(atomic_load(&var) == zero);\n";
-        code += "  status &= " + is_equality_true + "(atomic_load(&g_var) == zero);\n";
-        code += "  status &= " + is_equality_true + "(atomic_load(&a_var[0]) == zero);\n";
-        code += "  status &= " + is_equality_true + "(atomic_load(&a_var[1]) == zero);\n";
-    } else {
+    if (ti.is_atomic())
+    {
+        code += "  status &= " + is_equality_true
+            + "(atomic_load(&var) == zero);\n";
+        code += "  status &= " + is_equality_true
+            + "(atomic_load(&g_var) == zero);\n";
+        code += "  status &= " + is_equality_true
+            + "(atomic_load(&a_var[0]) == zero);\n";
+        code += "  status &= " + is_equality_true
+            + "(atomic_load(&a_var[1]) == zero);\n";
+    }
+    else
+    {
         code += "  status &= " + is_equality_true + "(var == zero);\n";
         code += "  status &= " + is_equality_true + "(g_var == zero);\n";
         code += "  status &= " + is_equality_true + "(a_var[0] == zero);\n";
@@ -792,7 +956,8 @@
     static char writer_src[MAX_STR];
     int num_printed = 0;
-    if ( !ti.is_atomic() ) {
+    if (!ti.is_atomic())
+    {
         const char* writer_template_normal =
             "kernel void writer( global %s* src, uint idx ) {\n"
             "  var = from_buf(src[0]);\n"
@@ -801,8 +966,11 @@
             "  a_var[1] = from_buf(src[3]);\n"
             "  p_var = a_var + idx;\n"
-        num_printed = snprintf(writer_src,sizeof(writer_src),writer_template_normal,ti.get_buf_elem_type());
-    } else {
+        num_printed = snprintf(writer_src, sizeof(writer_src),
+                               writer_template_normal, ti.get_buf_elem_type());
+    }
+    else
+    {
         const char* writer_template_atomic =
             "kernel void writer( global %s* src, uint idx ) {\n"
             "  atomic_store( &var, from_buf(src[0]) );\n"
@@ -811,9 +979,10 @@
             "  atomic_store( &a_var[1], from_buf(src[3]) );\n"
             "  p_var = a_var + idx;\n"
-        num_printed = snprintf(writer_src,sizeof(writer_src),writer_template_atomic,ti.get_buf_elem_type());
+        num_printed = snprintf(writer_src, sizeof(writer_src),
+                               writer_template_atomic, ti.get_buf_elem_type());
-    assert( num_printed < sizeof(writer_src) );
+    assert(num_printed < sizeof(writer_src));
     std::string result = writer_src;
     return result;
@@ -826,7 +995,8 @@
     static char reader_src[MAX_STR];
     int num_printed = 0;
-    if ( !ti.is_atomic() ) {
+    if (!ti.is_atomic())
+    {
         const char* reader_template_normal =
             "kernel void reader( global %s* dest, %s ptr_write_val ) {\n"
             "  *p_var = from_buf(ptr_write_val);\n"
@@ -835,8 +1005,12 @@
             "  dest[2] = to_buf(a_var[0]);\n"
             "  dest[3] = to_buf(a_var[1]);\n"
-        num_printed = snprintf(reader_src,sizeof(reader_src),reader_template_normal,ti.get_buf_elem_type(),ti.get_buf_elem_type());
-    } else {
+        num_printed =
+            snprintf(reader_src, sizeof(reader_src), reader_template_normal,
+                     ti.get_buf_elem_type(), ti.get_buf_elem_type());
+    }
+    else
+    {
         const char* reader_template_atomic =
             "kernel void reader( global %s* dest, %s ptr_write_val ) {\n"
             "  atomic_store( p_var, from_buf(ptr_write_val) );\n"
@@ -845,40 +1019,53 @@
             "  dest[2] = to_buf( atomic_load( &a_var[0] ) );\n"
             "  dest[3] = to_buf( atomic_load( &a_var[1] ) );\n"
-        num_printed = snprintf(reader_src,sizeof(reader_src),reader_template_atomic,ti.get_buf_elem_type(),ti.get_buf_elem_type());
+        num_printed =
+            snprintf(reader_src, sizeof(reader_src), reader_template_atomic,
+                     ti.get_buf_elem_type(), ti.get_buf_elem_type());
-    assert( num_printed < sizeof(reader_src) );
+    assert(num_printed < sizeof(reader_src));
     std::string result = reader_src;
     return result;
 // Check that all globals where appropriately default-initialized.
-static int check_global_initialization(cl_context context, cl_program program, cl_command_queue queue)
+static int check_global_initialization(cl_context context, cl_program program,
+                                       cl_command_queue queue)
     int status = CL_SUCCESS;
     // Create a buffer on device to store a unique integer.
     cl_int is_init_valid = 0;
-    clMemWrapper buffer(clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(is_init_valid), &is_init_valid, &status));
+    clMemWrapper buffer(
+        clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+                       sizeof(is_init_valid), &is_init_valid, &status));
     test_error_ret(status, "Failed to allocate buffer", status);
     // Create, setup and invoke kernel.
-    clKernelWrapper global_check(clCreateKernel(program, "global_check", &status));
+    clKernelWrapper global_check(
+        clCreateKernel(program, "global_check", &status));
     test_error_ret(status, "Failed to create global_check kernel", status);
     status = clSetKernelArg(global_check, 0, sizeof(cl_mem), &buffer);
-    test_error_ret(status, "Failed to set up argument for the global_check kernel", status);
+    test_error_ret(status,
+                   "Failed to set up argument for the global_check kernel",
+                   status);
     const cl_uint work_dim = 1;
     const size_t global_work_offset[] = { 0 };
     const size_t global_work_size[] = { 1 };
-    status = clEnqueueNDRangeKernel(queue, global_check, work_dim, global_work_offset, global_work_size, nullptr, 0, nullptr, nullptr);
+    status = clEnqueueNDRangeKernel(queue, global_check, work_dim,
+                                    global_work_offset, global_work_size,
+                                    nullptr, 0, nullptr, nullptr);
     test_error_ret(status, "Failed to run global_check kernel", status);
     status = clFinish(queue);
     test_error_ret(status, "clFinish() failed", status);
     // Read back the memory buffer from the device.
-    status = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(is_init_valid), &is_init_valid, 0, nullptr, nullptr);
+    status =
+        clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(is_init_valid),
+                            &is_init_valid, 0, nullptr, nullptr);
     test_error_ret(status, "Failed to read buffer from device", status);
-    if (is_init_valid == 0) {
+    if (is_init_valid == 0)
+    {
         log_error("Unexpected default values were detected");
         return 1;
@@ -887,58 +1074,75 @@
 // Check write-then-read.
-static int l_write_read( cl_device_id device, cl_context context, cl_command_queue queue )
+static int l_write_read(cl_device_id device, cl_context context,
+                        cl_command_queue queue)
     int status = CL_SUCCESS;
     int itype;
-    RandomSeed rand_state( gRandomSeed );
+    RandomSeed rand_state(gRandomSeed);
-    for ( itype = 0; itype < num_type_info ; itype++ ) {
-        status = status | l_write_read_for_type(device,context,queue,type_info[itype], rand_state );
+    for (itype = 0; itype < num_type_info; itype++)
+    {
+        status = status
+            | l_write_read_for_type(device, context, queue, type_info[itype],
+                                    rand_state);
     return status;
-static int l_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state )
+static int l_write_read_for_type(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, const TypeInfo& ti,
+                                 RandomSeed& rand_state)
     int err = CL_SUCCESS;
-    std::string type_name( ti.get_name() );
+    std::string type_name(ti.get_name());
     const char* tn = type_name.c_str();
-    log_info("  %s ",tn);
+    log_info("  %s ", tn);
     StringTable ksrc;
-    ksrc.add( l_get_fp64_pragma() );
-    ksrc.add( l_get_cles_int64_pragma() );
-    if (ti.is_atomic_64bit())
-      ksrc.add( l_get_int64_atomic_pragma() );
-    ksrc.add( conversion_functions(ti) );
-    ksrc.add( global_decls(ti,false) );
-    ksrc.add( global_check_function(ti) );
-    ksrc.add( writer_function(ti) );
-    ksrc.add( reader_function(ti) );
+    ksrc.add(l_get_fp64_pragma());
+    ksrc.add(l_get_cles_int64_pragma());
+    if (ti.is_atomic_64bit()) ksrc.add(l_get_int64_atomic_pragma());
+    ksrc.add(conversion_functions(ti));
+    ksrc.add(global_decls(ti, false));
+    ksrc.add(global_check_function(ti));
+    ksrc.add(writer_function(ti));
+    ksrc.add(reader_function(ti));
     int status = CL_SUCCESS;
     clProgramWrapper program;
     clKernelWrapper writer;
-    status = create_single_kernel_helper_with_build_options(context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer", OPTIONS);
-    test_error_ret(status,"Failed to create program for read-after-write test",status);
+    status = create_single_kernel_helper_with_build_options(
+        context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer",
+        OPTIONS);
+    test_error_ret(status, "Failed to create program for read-after-write test",
+                   status);
-    clKernelWrapper reader( clCreateKernel( program, "reader", &status ) );
-    test_error_ret(status,"Failed to create reader kernel for read-after-write test",status);
+    clKernelWrapper reader(clCreateKernel(program, "reader", &status));
+    test_error_ret(status,
+                   "Failed to create reader kernel for read-after-write test",
+                   status);
     // Check size query.
     size_t used_bytes = 0;
-    status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 );
-    test_error_ret(status,"Failed to query global variable total size",status);
-    size_t expected_used_bytes =
-        (NUM_TESTED_VALUES-1)*ti.get_size() // Two regular variables and an array of 2 elements.
-        + ( l_64bit_device ? 8 : 4 ); // The pointer
-    if ( used_bytes < expected_used_bytes ) {
-        log_error("Error program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_used_bytes, (unsigned long long)used_bytes );
+    status = clGetProgramBuildInfo(program, device,
+                                   CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE,
+                                   sizeof(used_bytes), &used_bytes, 0);
+    test_error_ret(status, "Failed to query global variable total size",
+                   status);
+    size_t expected_used_bytes = (NUM_TESTED_VALUES - 1)
+            * ti.get_size() // Two regular variables and an array of 2 elements.
+        + (l_64bit_device ? 8 : 4); // The pointer
+    if (used_bytes < expected_used_bytes)
+    {
+        log_error("Error program query for global variable total size query "
+                  "failed: Expected at least %llu but got %llu\n",
+                  (unsigned long long)expected_used_bytes,
+                  (unsigned long long)used_bytes);
         err |= 1;
@@ -951,90 +1155,131 @@
     cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT);
     cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT);
-    clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status ) );
-    test_error_ret(status,"Failed to allocate write buffer",status);
-    clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status ) );
-    test_error_ret(status,"Failed to allocate read buffer",status);
+    clMemWrapper write_mem(clCreateBuffer(
+        context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status));
+    test_error_ret(status, "Failed to allocate write buffer", status);
+    clMemWrapper read_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                         read_data_size, read_data, &status));
+    test_error_ret(status, "Failed to allocate read buffer", status);
-    status = clSetKernelArg(writer,0,sizeof(cl_mem),&write_mem); test_error_ret(status,"set arg",status);
-    status = clSetKernelArg(reader,0,sizeof(cl_mem),&read_mem); test_error_ret(status,"set arg",status);
+    status = clSetKernelArg(writer, 0, sizeof(cl_mem), &write_mem);
+    test_error_ret(status, "set arg", status);
+    status = clSetKernelArg(reader, 0, sizeof(cl_mem), &read_mem);
+    test_error_ret(status, "set arg", status);
     // Boolean random data needs to be massaged a bit more.
-    const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES ) : NUM_ROUNDS;
+    const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES) : NUM_ROUNDS;
     unsigned bool_iter = 0;
-    for ( int iround = 0; iround < num_rounds ; iround++ ) {
-        for ( cl_uint iptr_idx = 0; iptr_idx < 2 ; iptr_idx++ ) { // Index into array, to write via pointer
+    for (int iround = 0; iround < num_rounds; iround++)
+    {
+        for (cl_uint iptr_idx = 0; iptr_idx < 2; iptr_idx++)
+        { // Index into array, to write via pointer
             // Generate new random data to push through.
-            // Generate 5 * 128 bytes all the time, even though the test for many types use less than all that.
+            // Generate 5 * 128 bytes all the time, even though the test for
+            // many types use less than all that.
-            cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0);
+            cl_uchar* write_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0,
+                0, 0, 0);
-            if ( ti.is_bool() ) {
+            if (ti.is_bool())
+            {
                 // For boolean, random data cast to bool isn't very random.
                 // So use the bottom bit of bool_value_iter to get true
                 // diversity.
-                for ( unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES ; value_idx++ ) {
-                    write_data[value_idx] = (1<<value_idx) & bool_iter;
-                    //printf(" %s", (write_data[value_idx] ? "true" : "false" ));
+                for (unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES;
+                     value_idx++)
+                {
+                    write_data[value_idx] = (1 << value_idx) & bool_iter;
+                    // printf(" %s", (write_data[value_idx] ? "true" : "false"
+                    // ));
-            } else {
-                l_set_randomly( write_data, write_data_size, rand_state );
-            status = clSetKernelArg(writer,1,sizeof(cl_uint),&iptr_idx); test_error_ret(status,"set arg",status);
+            else
+            {
+                l_set_randomly(write_data, write_data_size, rand_state);
+            }
+            status = clSetKernelArg(writer, 1, sizeof(cl_uint), &iptr_idx);
+            test_error_ret(status, "set arg", status);
             // The value to write via the pointer should be taken from the
             // 5th typed slot of the write_data.
-            status = clSetKernelArg(reader,1,ti.get_size(),write_data + (NUM_TESTED_VALUES-1)*ti.get_size()); test_error_ret(status,"set arg",status);
+            status = clSetKernelArg(
+                reader, 1, ti.get_size(),
+                write_data + (NUM_TESTED_VALUES - 1) * ti.get_size());
+            test_error_ret(status, "set arg", status);
             // Determine the expected values.
             cl_uchar expected[read_data_size];
-            memset( expected, -1, sizeof(expected) );
-            l_copy( expected, 0, write_data, 0, ti );
-            l_copy( expected, 1, write_data, 1, ti );
-            l_copy( expected, 2, write_data, 2, ti );
-            l_copy( expected, 3, write_data, 3, ti );
-            // But we need to take into account the value from the pointer write.
-            // The 2 represents where the "a" array values begin in our read-back.
-            l_copy( expected, 2 + iptr_idx, write_data, 4, ti );
+            memset(expected, -1, sizeof(expected));
+            l_copy(expected, 0, write_data, 0, ti);
+            l_copy(expected, 1, write_data, 1, ti);
+            l_copy(expected, 2, write_data, 2, ti);
+            l_copy(expected, 3, write_data, 3, ti);
+            // But we need to take into account the value from the pointer
+            // write. The 2 represents where the "a" array values begin in our
+            // read-back.
+            l_copy(expected, 2 + iptr_idx, write_data, 4, ti);
             clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0);
-            if ( ti.is_bool() ) {
+            if (ti.is_bool())
+            {
                 // Collapse down to one bit.
-                for ( unsigned i = 0; i <  NUM_TESTED_VALUES-1 ; i++ ) expected[i] = (bool)expected[i];
+                for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++)
+                    expected[i] = (bool)expected[i];
-            cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
+            cl_uchar* read_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0,
+                0, 0);
             memset(read_data, -1, read_data_size);
             clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0);
             // Now run the kernel
             const size_t one = 1;
-            status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status);
-            status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status);
-            status = clFinish(queue); test_error_ret(status,"finish",status);
+            status =
+                clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0);
+            test_error_ret(status, "enqueue writer", status);
+            status =
+                clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0);
+            test_error_ret(status, "enqueue reader", status);
+            status = clFinish(queue);
+            test_error_ret(status, "finish", status);
-            read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
+            read_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0,
+                0, 0);
-            if ( ti.is_bool() ) {
+            if (ti.is_bool())
+            {
                 // Collapse down to one bit.
-                for ( unsigned i = 0; i <  NUM_TESTED_VALUES-1 ; i++ ) read_data[i] = (bool)read_data[i];
+                for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++)
+                    read_data[i] = (bool)read_data[i];
             // Compare only the valid returned bytes.
-            int compare_result = l_compare( "read-after-write", expected, read_data, NUM_TESTED_VALUES-1, ti );
-            // log_info("Compared %d values each of size %llu. Result %d\n", NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), compare_result );
+            int compare_result =
+                l_compare("read-after-write", expected, read_data,
+                          NUM_TESTED_VALUES - 1, ti);
+            // log_info("Compared %d values each of size %llu. Result %d\n",
+            // NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(),
+            // compare_result );
             err |= compare_result;
             clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0);
-            if ( err ) break;
+            if (err) break;
-    if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
+    if (CL_SUCCESS == err)
+    {
+        log_info("OK\n");
+        FLUSH;
+    }
     return err;
@@ -1042,74 +1287,97 @@
 // Check initialization, then, read, then write, then read.
-static int l_init_write_read( cl_device_id device, cl_context context, cl_command_queue queue )
+static int l_init_write_read(cl_device_id device, cl_context context,
+                             cl_command_queue queue)
     int status = CL_SUCCESS;
     int itype;
-    RandomSeed rand_state( gRandomSeed );
+    RandomSeed rand_state(gRandomSeed);
-    for ( itype = 0; itype < num_type_info ; itype++ ) {
-        status = status | l_init_write_read_for_type(device,context,queue,type_info[itype], rand_state );
+    for (itype = 0; itype < num_type_info; itype++)
+    {
+        status = status
+            | l_init_write_read_for_type(device, context, queue,
+                                         type_info[itype], rand_state);
     return status;
-static int l_init_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state )
+static int l_init_write_read_for_type(cl_device_id device, cl_context context,
+                                      cl_command_queue queue,
+                                      const TypeInfo& ti,
+                                      RandomSeed& rand_state)
     int err = CL_SUCCESS;
-    std::string type_name( ti.get_name() );
+    std::string type_name(ti.get_name());
     const char* tn = type_name.c_str();
-    log_info("  %s ",tn);
+    log_info("  %s ", tn);
     StringTable ksrc;
-    ksrc.add( l_get_fp64_pragma() );
-    ksrc.add( l_get_cles_int64_pragma() );
-    if (ti.is_atomic_64bit())
-      ksrc.add( l_get_int64_atomic_pragma() );
-    ksrc.add( conversion_functions(ti) );
-    ksrc.add( global_decls(ti,true) );
-    ksrc.add( writer_function(ti) );
-    ksrc.add( reader_function(ti) );
+    ksrc.add(l_get_fp64_pragma());
+    ksrc.add(l_get_cles_int64_pragma());
+    if (ti.is_atomic_64bit()) ksrc.add(l_get_int64_atomic_pragma());
+    ksrc.add(conversion_functions(ti));
+    ksrc.add(global_decls(ti, true));
+    ksrc.add(writer_function(ti));
+    ksrc.add(reader_function(ti));
     int status = CL_SUCCESS;
     clProgramWrapper program;
     clKernelWrapper writer;
-    status = create_single_kernel_helper_with_build_options(context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer", OPTIONS);
-    test_error_ret(status,"Failed to create program for init-read-after-write test",status);
+    status = create_single_kernel_helper_with_build_options(
+        context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer",
+        OPTIONS);
+    test_error_ret(status,
+                   "Failed to create program for init-read-after-write test",
+                   status);
-    clKernelWrapper reader( clCreateKernel( program, "reader", &status ) );
-    test_error_ret(status,"Failed to create reader kernel for init-read-after-write test",status);
+    clKernelWrapper reader(clCreateKernel(program, "reader", &status));
+    test_error_ret(
+        status, "Failed to create reader kernel for init-read-after-write test",
+        status);
     // Check size query.
     size_t used_bytes = 0;
-    status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 );
-    test_error_ret(status,"Failed to query global variable total size",status);
-    size_t expected_used_bytes =
-        (NUM_TESTED_VALUES-1)*ti.get_size() // Two regular variables and an array of 2 elements.
-        + ( l_64bit_device ? 8 : 4 ); // The pointer
-    if ( used_bytes < expected_used_bytes ) {
-        log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_used_bytes, (unsigned long long)used_bytes );
+    status = clGetProgramBuildInfo(program, device,
+                                   CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE,
+                                   sizeof(used_bytes), &used_bytes, 0);
+    test_error_ret(status, "Failed to query global variable total size",
+                   status);
+    size_t expected_used_bytes = (NUM_TESTED_VALUES - 1)
+            * ti.get_size() // Two regular variables and an array of 2 elements.
+        + (l_64bit_device ? 8 : 4); // The pointer
+    if (used_bytes < expected_used_bytes)
+    {
+        log_error("Error: program query for global variable total size query "
+                  "failed: Expected at least %llu but got %llu\n",
+                  (unsigned long long)expected_used_bytes,
+                  (unsigned long long)used_bytes);
         err |= 1;
     // We need to create 5 random values of the given type,
     // and read 4 of them back.
     const size_t write_data_size = NUM_TESTED_VALUES * sizeof(cl_ulong16);
-    const size_t read_data_size = (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16);
+    const size_t read_data_size = (NUM_TESTED_VALUES - 1) * sizeof(cl_ulong16);
     cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT);
     cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT);
-    clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status ) );
-    test_error_ret(status,"Failed to allocate write buffer",status);
-    clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status ) );
-    test_error_ret(status,"Failed to allocate read buffer",status);
+    clMemWrapper write_mem(clCreateBuffer(
+        context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status));
+    test_error_ret(status, "Failed to allocate write buffer", status);
+    clMemWrapper read_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                         read_data_size, read_data, &status));
+    test_error_ret(status, "Failed to allocate read buffer", status);
-    status = clSetKernelArg(writer,0,sizeof(cl_mem),&write_mem); test_error_ret(status,"set arg",status);
-    status = clSetKernelArg(reader,0,sizeof(cl_mem),&read_mem); test_error_ret(status,"set arg",status);
+    status = clSetKernelArg(writer, 0, sizeof(cl_mem), &write_mem);
+    test_error_ret(status, "set arg", status);
+    status = clSetKernelArg(reader, 0, sizeof(cl_mem), &read_mem);
+    test_error_ret(status, "set arg", status);
     // Boolean random data needs to be massaged a bit more.
-    const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES ) : NUM_ROUNDS;
+    const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES) : NUM_ROUNDS;
     unsigned bool_iter = 0;
     // We need to count iterations.  We do something *different on the
@@ -1117,107 +1385,152 @@
     // values.
     unsigned iteration = 0;
-    for ( int iround = 0; iround < num_rounds ; iround++ ) {
-        for ( cl_uint iptr_idx = 0; iptr_idx < 2 ; iptr_idx++ ) { // Index into array, to write via pointer
+    for (int iround = 0; iround < num_rounds; iround++)
+    {
+        for (cl_uint iptr_idx = 0; iptr_idx < 2; iptr_idx++)
+        { // Index into array, to write via pointer
             // Generate new random data to push through.
-            // Generate 5 * 128 bytes all the time, even though the test for many types use less than all that.
+            // Generate 5 * 128 bytes all the time, even though the test for
+            // many types use less than all that.
-            cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0);
+            cl_uchar* write_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0,
+                0, 0, 0);
-            if ( ti.is_bool() ) {
+            if (ti.is_bool())
+            {
                 // For boolean, random data cast to bool isn't very random.
                 // So use the bottom bit of bool_value_iter to get true
                 // diversity.
-                for ( unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES ; value_idx++ ) {
-                    write_data[value_idx] = (1<<value_idx) & bool_iter;
-                    //printf(" %s", (write_data[value_idx] ? "true" : "false" ));
+                for (unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES;
+                     value_idx++)
+                {
+                    write_data[value_idx] = (1 << value_idx) & bool_iter;
+                    // printf(" %s", (write_data[value_idx] ? "true" : "false"
+                    // ));
-            } else {
-                l_set_randomly( write_data, write_data_size, rand_state );
-            status = clSetKernelArg(writer,1,sizeof(cl_uint),&iptr_idx); test_error_ret(status,"set arg",status);
+            else
+            {
+                l_set_randomly(write_data, write_data_size, rand_state);
+            }
+            status = clSetKernelArg(writer, 1, sizeof(cl_uint), &iptr_idx);
+            test_error_ret(status, "set arg", status);
-            if ( !iteration ) {
+            if (!iteration)
+            {
                 // On first iteration, the value we write via the last arg
                 // to the "reader" function is 0.
                 // It's way easier to code the test this way.
-                ti.init( write_data + (NUM_TESTED_VALUES-1)*ti.get_size(), 0 );
+                ti.init(write_data + (NUM_TESTED_VALUES - 1) * ti.get_size(),
+                        0);
             // The value to write via the pointer should be taken from the
             // 5th typed slot of the write_data.
-            status = clSetKernelArg(reader,1,ti.get_size(),write_data + (NUM_TESTED_VALUES-1)*ti.get_size()); test_error_ret(status,"set arg",status);
+            status = clSetKernelArg(
+                reader, 1, ti.get_size(),
+                write_data + (NUM_TESTED_VALUES - 1) * ti.get_size());
+            test_error_ret(status, "set arg", status);
             // Determine the expected values.
             cl_uchar expected[read_data_size];
-            memset( expected, -1, sizeof(expected) );
-            if ( iteration ) {
-                l_copy( expected, 0, write_data, 0, ti );
-                l_copy( expected, 1, write_data, 1, ti );
-                l_copy( expected, 2, write_data, 2, ti );
-                l_copy( expected, 3, write_data, 3, ti );
-                // But we need to take into account the value from the pointer write.
-                // The 2 represents where the "a" array values begin in our read-back.
-                // But we need to take into account the value from the pointer write.
-                l_copy( expected, 2 + iptr_idx, write_data, 4, ti );
-            } else {
+            memset(expected, -1, sizeof(expected));
+            if (iteration)
+            {
+                l_copy(expected, 0, write_data, 0, ti);
+                l_copy(expected, 1, write_data, 1, ti);
+                l_copy(expected, 2, write_data, 2, ti);
+                l_copy(expected, 3, write_data, 3, ti);
+                // But we need to take into account the value from the pointer
+                // write. The 2 represents where the "a" array values begin in
+                // our read-back. But we need to take into account the value
+                // from the pointer write.
+                l_copy(expected, 2 + iptr_idx, write_data, 4, ti);
+            }
+            else
+            {
                 // On first iteration, expect these initialized values!
                 // See the decls_template_with_init above.
-                ti.init( expected, 0 );
-                ti.init( expected + ti.get_size(), 1 );
-                ti.init( expected + 2*ti.get_size(), 1 );
+                ti.init(expected, 0);
+                ti.init(expected + ti.get_size(), 1);
+                ti.init(expected + 2 * ti.get_size(), 1);
                 // Emulate the effect of the write via the pointer.
                 // The value is 0, not 1 (see above).
                 // The pointer is always initialized to the second element
                 // of the array. So it goes into slot 3 of the "expected" array.
-                ti.init( expected + 3*ti.get_size(), 0 );
+                ti.init(expected + 3 * ti.get_size(), 0);
-            if ( ti.is_bool() ) {
+            if (ti.is_bool())
+            {
                 // Collapse down to one bit.
-                for ( unsigned i = 0; i <  NUM_TESTED_VALUES-1 ; i++ ) expected[i] = (bool)expected[i];
+                for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++)
+                    expected[i] = (bool)expected[i];
             clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0);
-            cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
-            memset( read_data, -1, read_data_size );
+            cl_uchar* read_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0,
+                0, 0);
+            memset(read_data, -1, read_data_size);
             clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0);
             // Now run the kernel
             const size_t one = 1;
-            if ( iteration ) {
-                status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status);
-            } else {
+            if (iteration)
+            {
+                status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0,
+                                                0, 0);
+                test_error_ret(status, "enqueue writer", status);
+            }
+            else
+            {
                 // On first iteration, we should be picking up the
                 // initialized value. So don't enqueue the writer.
-            status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status);
-            status = clFinish(queue); test_error_ret(status,"finish",status);
+            status =
+                clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0);
+            test_error_ret(status, "enqueue reader", status);
+            status = clFinish(queue);
+            test_error_ret(status, "finish", status);
-            read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
+            read_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0,
+                0, 0);
-            if ( ti.is_bool() ) {
+            if (ti.is_bool())
+            {
                 // Collapse down to one bit.
-                for ( unsigned i = 0; i <  NUM_TESTED_VALUES-1 ; i++ ) read_data[i] = (bool)read_data[i];
+                for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++)
+                    read_data[i] = (bool)read_data[i];
             // Compare only the valid returned bytes.
-            //log_info(" Round %d ptr_idx %u\n", iround, iptr_idx );
-            int compare_result = l_compare( "init-write-read", expected, read_data, NUM_TESTED_VALUES-1, ti );
-            //log_info("Compared %d values each of size %llu. Result %d\n", NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), compare_result );
+            // log_info(" Round %d ptr_idx %u\n", iround, iptr_idx );
+            int compare_result =
+                l_compare("init-write-read", expected, read_data,
+                          NUM_TESTED_VALUES - 1, ti);
+            // log_info("Compared %d values each of size %llu. Result %d\n",
+            // NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(),
+            // compare_result );
             err |= compare_result;
             clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0);
-            if ( err ) break;
+            if (err) break;
-    if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
+    if (CL_SUCCESS == err)
+    {
+        log_info("OK\n");
+        FLUSH;
+    }
@@ -1226,12 +1539,14 @@
 // Check that we can make at least one variable with size
-// max_size which is returned from the device info property : CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE.
-static int l_capacity( cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size )
+// max_size which is returned from the device info property :
+static int l_capacity(cl_device_id device, cl_context context,
+                      cl_command_queue queue, size_t max_size)
     int err = CL_SUCCESS;
     // Just test one type.
-    const TypeInfo ti( l_find_type("uchar") );
+    const TypeInfo ti(l_find_type("uchar"));
     log_info(" l_capacity...");
     const char prog_src_template[] =
@@ -1254,83 +1569,132 @@
         "  dest[get_global_linear_id()] = var[get_global_id(0)];\n"
     char prog_src[MAX_STR];
-    int num_printed = snprintf(prog_src,sizeof(prog_src),prog_src_template,max_size, max_size);
-    assert( num_printed < MAX_STR ); // or increase MAX_STR
+    int num_printed = snprintf(prog_src, sizeof(prog_src), prog_src_template,
+                               max_size, max_size);
+    assert(num_printed < MAX_STR); // or increase MAX_STR
+    (void)num_printed;
     StringTable ksrc;
-    ksrc.add( prog_src );
+    ksrc.add(prog_src);
     int status = CL_SUCCESS;
     clProgramWrapper program;
     clKernelWrapper get_max_size;
-    status = create_single_kernel_helper_with_build_options(context, &program, &get_max_size, ksrc.num_str(), ksrc.strs(), "get_max_size", OPTIONS);
-    test_error_ret(status,"Failed to create program for capacity test",status);
+    status = create_single_kernel_helper_with_build_options(
+        context, &program, &get_max_size, ksrc.num_str(), ksrc.strs(),
+        "get_max_size", OPTIONS);
+    test_error_ret(status, "Failed to create program for capacity test",
+                   status);
     // Check size query.
     size_t used_bytes = 0;
-    status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 );
-    test_error_ret(status,"Failed to query global variable total size",status);
-    if ( used_bytes < max_size ) {
-        log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)max_size, (unsigned long long)used_bytes );
+    status = clGetProgramBuildInfo(program, device,
+                                   CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE,
+                                   sizeof(used_bytes), &used_bytes, 0);
+    test_error_ret(status, "Failed to query global variable total size",
+                   status);
+    if (used_bytes < max_size)
+    {
+        log_error("Error: program query for global variable total size query "
+                  "failed: Expected at least %llu but got %llu\n",
+                  (unsigned long long)max_size, (unsigned long long)used_bytes);
         err |= 1;
     // Prepare to execute
-    clKernelWrapper writer( clCreateKernel( program, "writer", &status ) );
-    test_error_ret(status,"Failed to create writer kernel for capacity test",status);
-    clKernelWrapper reader( clCreateKernel( program, "reader", &status ) );
-    test_error_ret(status,"Failed to create reader kernel for capacity test",status);
+    clKernelWrapper writer(clCreateKernel(program, "writer", &status));
+    test_error_ret(status, "Failed to create writer kernel for capacity test",
+                   status);
+    clKernelWrapper reader(clCreateKernel(program, "reader", &status));
+    test_error_ret(status, "Failed to create reader kernel for capacity test",
+                   status);
     cl_ulong max_size_ret = 0;
-    const size_t arr_size = 10*1024*1024;
-    cl_uchar* buffer = (cl_uchar*) align_malloc( arr_size, ALIGNMENT );
+    const size_t arr_size = 10 * 1024 * 1024;
+    cl_uchar* buffer = (cl_uchar*)align_malloc(arr_size, ALIGNMENT);
-    if ( !buffer ) { log_error("Failed to allocate buffer\n"); return 1; }
+    if (!buffer)
+    {
+        log_error("Failed to allocate buffer\n");
+        return 1;
+    }
-    clMemWrapper max_size_ret_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(max_size_ret), &max_size_ret, &status ) );
-    test_error_ret(status,"Failed to allocate size query buffer",status);
-    clMemWrapper buffer_mem( clCreateBuffer( context, CL_MEM_READ_WRITE, arr_size, 0, &status ) );
-    test_error_ret(status,"Failed to allocate write buffer",status);
+    clMemWrapper max_size_ret_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                                 sizeof(max_size_ret),
+                                                 &max_size_ret, &status));
+    test_error_ret(status, "Failed to allocate size query buffer", status);
+    clMemWrapper buffer_mem(
+        clCreateBuffer(context, CL_MEM_READ_WRITE, arr_size, 0, &status));
+    test_error_ret(status, "Failed to allocate write buffer", status);
-    status = clSetKernelArg(get_max_size,0,sizeof(cl_mem),&max_size_ret_mem); test_error_ret(status,"set arg",status);
-    status = clSetKernelArg(writer,0,sizeof(cl_mem),&buffer_mem); test_error_ret(status,"set arg",status);
-    status = clSetKernelArg(reader,0,sizeof(cl_mem),&buffer_mem); test_error_ret(status,"set arg",status);
+    status = clSetKernelArg(get_max_size, 0, sizeof(cl_mem), &max_size_ret_mem);
+    test_error_ret(status, "set arg", status);
+    status = clSetKernelArg(writer, 0, sizeof(cl_mem), &buffer_mem);
+    test_error_ret(status, "set arg", status);
+    status = clSetKernelArg(reader, 0, sizeof(cl_mem), &buffer_mem);
+    test_error_ret(status, "set arg", status);
     // Check the macro value of CL_DEVICE_MAX_GLOBAL_VARIABLE
     const size_t one = 1;
-    status = clEnqueueNDRangeKernel(queue,get_max_size,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue size query",status);
-    status = clFinish(queue); test_error_ret(status,"finish",status);
+    status =
+        clEnqueueNDRangeKernel(queue, get_max_size, 1, 0, &one, 0, 0, 0, 0);
+    test_error_ret(status, "enqueue size query", status);
+    status = clFinish(queue);
+    test_error_ret(status, "finish", status);
-    cl_uchar *max_size_ret_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, max_size_ret_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(max_size_ret), 0, 0, 0, 0);
-    if ( max_size_ret != max_size ) {
-        log_error("Error: preprocessor definition for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE is %llu and does not match device query value %llu\n",
-                (unsigned long long) max_size_ret,
-                (unsigned long long) max_size );
+    cl_uchar* max_size_ret_ptr = (cl_uchar*)clEnqueueMapBuffer(
+        queue, max_size_ret_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(max_size_ret),
+        0, 0, 0, 0);
+    if (max_size_ret != max_size)
+    {
+        log_error("Error: preprocessor definition for "
+                  "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE is %llu and does not "
+                  "match device query value %llu\n",
+                  (unsigned long long)max_size_ret,
+                  (unsigned long long)max_size);
         err |= 1;
     clEnqueueUnmapMemObject(queue, max_size_ret_mem, max_size_ret_ptr, 0, 0, 0);
-    RandomSeed rand_state_write( gRandomSeed );
-    for ( size_t offset = 0; offset < max_size ; offset += arr_size ) {
-        size_t curr_size = (max_size - offset) < arr_size ? (max_size - offset) : arr_size;
-        l_set_randomly( buffer, curr_size, rand_state_write );
-        status = clEnqueueWriteBuffer (queue, buffer_mem, CL_TRUE, 0, curr_size, buffer, 0, 0, 0);test_error_ret(status,"populate buffer_mem object",status);
-        status = clEnqueueNDRangeKernel(queue,writer,1,&offset,&curr_size,0,0,0,0); test_error_ret(status,"enqueue writer",status);
-    status = clFinish(queue); test_error_ret(status,"finish",status);
+    RandomSeed rand_state_write(gRandomSeed);
+    for (size_t offset = 0; offset < max_size; offset += arr_size)
+    {
+        size_t curr_size =
+            (max_size - offset) < arr_size ? (max_size - offset) : arr_size;
+        l_set_randomly(buffer, curr_size, rand_state_write);
+        status = clEnqueueWriteBuffer(queue, buffer_mem, CL_TRUE, 0, curr_size,
+                                      buffer, 0, 0, 0);
+        test_error_ret(status, "populate buffer_mem object", status);
+        status = clEnqueueNDRangeKernel(queue, writer, 1, &offset, &curr_size,
+                                        0, 0, 0, 0);
+        test_error_ret(status, "enqueue writer", status);
+        status = clFinish(queue);
+        test_error_ret(status, "finish", status);
-    RandomSeed rand_state_read( gRandomSeed );
-    for ( size_t offset = 0; offset < max_size ; offset += arr_size ) {
-        size_t curr_size = (max_size - offset) < arr_size ? (max_size - offset) : arr_size;
-        status = clEnqueueNDRangeKernel(queue,reader,1,&offset,&curr_size,0,0,0,0); test_error_ret(status,"enqueue reader",status);
-        cl_uchar* read_mem_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, buffer_mem, CL_TRUE, CL_MAP_READ, 0, curr_size, 0, 0, 0, &status);test_error_ret(status,"map read data",status);
-        l_set_randomly( buffer, curr_size, rand_state_read );
-        err |= l_compare( "capacity", buffer, read_mem_ptr, curr_size, ti );
+    RandomSeed rand_state_read(gRandomSeed);
+    for (size_t offset = 0; offset < max_size; offset += arr_size)
+    {
+        size_t curr_size =
+            (max_size - offset) < arr_size ? (max_size - offset) : arr_size;
+        status = clEnqueueNDRangeKernel(queue, reader, 1, &offset, &curr_size,
+                                        0, 0, 0, 0);
+        test_error_ret(status, "enqueue reader", status);
+        cl_uchar* read_mem_ptr = (cl_uchar*)clEnqueueMapBuffer(
+            queue, buffer_mem, CL_TRUE, CL_MAP_READ, 0, curr_size, 0, 0, 0,
+            &status);
+        test_error_ret(status, "map read data", status);
+        l_set_randomly(buffer, curr_size, rand_state_read);
+        err |= l_compare("capacity", buffer, read_mem_ptr, curr_size, ti);
         clEnqueueUnmapMemObject(queue, buffer_mem, read_mem_ptr, 0, 0, 0);
-    if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
+    if (CL_SUCCESS == err)
+    {
+        log_info("OK\n");
+        FLUSH;
+    }
     return err;
@@ -1338,32 +1702,33 @@
 // Check operation on a user type.
-static int l_user_type( cl_device_id device, cl_context context, cl_command_queue queue, bool separate_compile )
+static int l_user_type(cl_device_id device, cl_context context,
+                       cl_command_queue queue, bool separate_compile)
     int err = CL_SUCCESS;
     // Just test one type.
-    const TypeInfo ti( l_find_type("uchar") );
-    log_info(" l_user_type %s...", separate_compile ? "separate compilation" : "single source compilation" );
+    const TypeInfo ti(l_find_type("uchar"));
+    log_info(" l_user_type %s...",
+             separate_compile ? "separate compilation"
+                              : "single source compilation");
-    if ( separate_compile && ! l_linker_available ) {
+    if (separate_compile && !l_linker_available)
+    {
         log_info("Separate compilation is not supported. Skipping test\n");
         return err;
     const char type_src[] =
         "typedef struct { uchar c; uint i; } my_struct_t;\n\n";
-    const char def_src[] =
-        "my_struct_t var = { 'a', 42 };\n\n";
-    const char decl_src[] =
-        "extern my_struct_t var;\n\n";
+    const char def_src[] = "my_struct_t var = { 'a', 42 };\n\n";
+    const char decl_src[] = "extern my_struct_t var;\n\n";
     // Don't use a host struct. We can't guarantee that the host
     // compiler has the same structure layout as the device compiler.
-    const char writer_src[] =
-        "kernel void writer( uchar c, uint i ) {\n"
-        "  var.c = c;\n"
-        "  var.i = i;\n"
-        "}\n\n";
+    const char writer_src[] = "kernel void writer( uchar c, uint i ) {\n"
+                              "  var.c = c;\n"
+                              "  var.i = i;\n"
+                              "}\n\n";
     const char reader_src[] =
         "kernel void reader( global uchar* C, global uint* I ) {\n"
         "  *C = var.c;\n"
@@ -1372,36 +1737,53 @@
     clProgramWrapper program;
-    if ( separate_compile ) {
+    if (separate_compile)
+    {
         // Separate compilation flow.
         StringTable wksrc;
-        wksrc.add( type_src );
-        wksrc.add( def_src );
-        wksrc.add( writer_src );
+        wksrc.add(type_src);
+        wksrc.add(def_src);
+        wksrc.add(writer_src);
         StringTable rksrc;
-        rksrc.add( type_src );
-        rksrc.add( decl_src );
-        rksrc.add( reader_src );
+        rksrc.add(type_src);
+        rksrc.add(decl_src);
+        rksrc.add(reader_src);
         int status = CL_SUCCESS;
-        clProgramWrapper writer_program( clCreateProgramWithSource( context, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), &status ) );
-        test_error_ret(status,"Failed to create writer program for user type test",status);
+        clProgramWrapper writer_program(clCreateProgramWithSource(
+            context, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), &status));
+        test_error_ret(status,
+                       "Failed to create writer program for user type test",
+                       status);
-        status = clCompileProgram( writer_program, 1, &device, OPTIONS, 0, 0, 0, 0, 0 );
-        if(check_error(status, "Failed to compile writer program for user type test (%s)", IGetErrorString(status)))
+        status = clCompileProgram(writer_program, 1, &device, OPTIONS, 0, 0, 0,
+                                  0, 0);
+        if (check_error(
+                status,
+                "Failed to compile writer program for user type test (%s)",
+                IGetErrorString(status)))
-            print_build_log(writer_program, 1, &device, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), OPTIONS);
+            print_build_log(writer_program, 1, &device, wksrc.num_str(),
+                            wksrc.strs(), wksrc.lengths(), OPTIONS);
             return status;
-        clProgramWrapper reader_program( clCreateProgramWithSource( context, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), &status ) );
-        test_error_ret(status,"Failed to create reader program for user type test",status);
+        clProgramWrapper reader_program(clCreateProgramWithSource(
+            context, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), &status));
+        test_error_ret(status,
+                       "Failed to create reader program for user type test",
+                       status);
-        status = clCompileProgram( reader_program, 1, &device, OPTIONS, 0, 0, 0, 0, 0 );
-        if(check_error(status, "Failed to compile reader program for user type test (%s)", IGetErrorString(status)))
+        status = clCompileProgram(reader_program, 1, &device, OPTIONS, 0, 0, 0,
+                                  0, 0);
+        if (check_error(
+                status,
+                "Failed to compile reader program for user type test (%s)",
+                IGetErrorString(status)))
-            print_build_log(reader_program, 1, &device, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), OPTIONS);
+            print_build_log(reader_program, 1, &device, rksrc.num_str(),
+                            rksrc.strs(), rksrc.lengths(), OPTIONS);
             return status;
@@ -1409,33 +1791,45 @@
         progs[0] = writer_program;
         progs[1] = reader_program;
-        program = clLinkProgram( context, 1, &device, "", 2, progs, 0, 0, &status );
-        if(check_error(status, "Failed to link program for user type test (%s)", IGetErrorString(status)))
+        program =
+            clLinkProgram(context, 1, &device, "", 2, progs, 0, 0, &status);
+        if (check_error(status,
+                        "Failed to link program for user type test (%s)",
+                        IGetErrorString(status)))
             print_build_log(program, 1, &device, 0, NULL, NULL, "");
             return status;
-    } else {
+    }
+    else
+    {
         // Single compilation flow.
         StringTable ksrc;
-        ksrc.add( type_src );
-        ksrc.add( def_src );
-        ksrc.add( writer_src );
-        ksrc.add( reader_src );
+        ksrc.add(type_src);
+        ksrc.add(def_src);
+        ksrc.add(writer_src);
+        ksrc.add(reader_src);
         int status = CL_SUCCESS;
-        status = create_single_kernel_helper_create_program(context, &program, ksrc.num_str(), ksrc.strs(), OPTIONS);
-        if(check_error(status, "Failed to build program for user type test (%s)", IGetErrorString(status)))
+        status = create_single_kernel_helper_create_program(
+            context, &program, ksrc.num_str(), ksrc.strs(), OPTIONS);
+        if (check_error(status,
+                        "Failed to build program for user type test (%s)",
+                        IGetErrorString(status)))
-            print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), OPTIONS);
+            print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(),
+                            ksrc.lengths(), OPTIONS);
             return status;
         status = clBuildProgram(program, 1, &device, OPTIONS, 0, 0);
-        if(check_error(status, "Failed to compile program for user type test (%s)", IGetErrorString(status)))
+        if (check_error(status,
+                        "Failed to compile program for user type test (%s)",
+                        IGetErrorString(status)))
-            print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), OPTIONS);
+            print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(),
+                            ksrc.lengths(), OPTIONS);
             return status;
@@ -1443,48 +1837,71 @@
     // Check size query.
     size_t used_bytes = 0;
-    int status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 );
-    test_error_ret(status,"Failed to query global variable total size",status);
+    int status = clGetProgramBuildInfo(
+        program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE,
+        sizeof(used_bytes), &used_bytes, 0);
+    test_error_ret(status, "Failed to query global variable total size",
+                   status);
     size_t expected_size = sizeof(cl_uchar) + sizeof(cl_uint);
-    if ( used_bytes < expected_size ) {
-        log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_size, (unsigned long long)used_bytes );
+    if (used_bytes < expected_size)
+    {
+        log_error("Error: program query for global variable total size query "
+                  "failed: Expected at least %llu but got %llu\n",
+                  (unsigned long long)expected_size,
+                  (unsigned long long)used_bytes);
         err |= 1;
     // Prepare to execute
-    clKernelWrapper writer( clCreateKernel( program, "writer", &status ) );
-    test_error_ret(status,"Failed to create writer kernel for user type test",status);
-    clKernelWrapper reader( clCreateKernel( program, "reader", &status ) );
-    test_error_ret(status,"Failed to create reader kernel for user type test",status);
+    clKernelWrapper writer(clCreateKernel(program, "writer", &status));
+    test_error_ret(status, "Failed to create writer kernel for user type test",
+                   status);
+    clKernelWrapper reader(clCreateKernel(program, "reader", &status));
+    test_error_ret(status, "Failed to create reader kernel for user type test",
+                   status);
     // Set up data.
     cl_uchar* uchar_data = (cl_uchar*)align_malloc(sizeof(cl_uchar), ALIGNMENT);
     cl_uint* uint_data = (cl_uint*)align_malloc(sizeof(cl_uint), ALIGNMENT);
-    clMemWrapper uchar_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar), uchar_data, &status ) );
-    test_error_ret(status,"Failed to allocate uchar buffer",status);
-    clMemWrapper uint_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uint), uint_data, &status ) );
-    test_error_ret(status,"Failed to allocate uint buffer",status);
+    clMemWrapper uchar_mem(clCreateBuffer(
+        context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar), uchar_data, &status));
+    test_error_ret(status, "Failed to allocate uchar buffer", status);
+    clMemWrapper uint_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                         sizeof(cl_uint), uint_data, &status));
+    test_error_ret(status, "Failed to allocate uint buffer", status);
-    status = clSetKernelArg(reader,0,sizeof(cl_mem),&uchar_mem); test_error_ret(status,"set arg",status);
-    status = clSetKernelArg(reader,1,sizeof(cl_mem),&uint_mem); test_error_ret(status,"set arg",status);
+    status = clSetKernelArg(reader, 0, sizeof(cl_mem), &uchar_mem);
+    test_error_ret(status, "set arg", status);
+    status = clSetKernelArg(reader, 1, sizeof(cl_mem), &uint_mem);
+    test_error_ret(status, "set arg", status);
     cl_uchar expected_uchar = 'a';
     cl_uint expected_uint = 42;
-    for ( unsigned iter = 0; iter < 5 ; iter++ ) { // Must go around at least twice
+    for (unsigned iter = 0; iter < 5; iter++)
+    { // Must go around at least twice
         // Read back data
         *uchar_data = -1;
         *uint_data = -1;
         const size_t one = 1;
-        status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status);
-        status = clFinish(queue); test_error_ret(status,"finish",status);
+        status = clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0);
+        test_error_ret(status, "enqueue reader", status);
+        status = clFinish(queue);
+        test_error_ret(status, "finish", status);
-        cl_uchar *uint_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, 0, 0, 0);
-        cl_uchar *uchar_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uchar_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar), 0, 0, 0, 0);
+        cl_uchar* uint_data_ptr =
+            (cl_uchar*)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ,
+                                          0, sizeof(cl_uint), 0, 0, 0, 0);
+        cl_uchar* uchar_data_ptr = (cl_uchar*)clEnqueueMapBuffer(
+            queue, uchar_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar), 0, 0,
+            0, 0);
-        if ( expected_uchar != *uchar_data || expected_uint != *uint_data ) {
-            log_error("FAILED: Iteration %d Got (0x%2x,%d) but expected (0x%2x,%d)\n",
-                    iter, (int)*uchar_data, *uint_data, (int)expected_uchar, expected_uint );
+        if (expected_uchar != *uchar_data || expected_uint != *uint_data)
+        {
+            log_error(
+                "FAILED: Iteration %d Got (0x%2x,%d) but expected (0x%2x,%d)\n",
+                iter, (int)*uchar_data, *uint_data, (int)expected_uchar,
+                expected_uint);
             err |= 1;
@@ -1498,13 +1915,21 @@
         // Write the new values into persistent store.
         *uchar_data = expected_uchar;
         *uint_data = expected_uint;
-        status = clSetKernelArg(writer,0,sizeof(cl_uchar),uchar_data); test_error_ret(status,"set arg",status);
-        status = clSetKernelArg(writer,1,sizeof(cl_uint),uint_data); test_error_ret(status,"set arg",status);
-        status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status);
-        status = clFinish(queue); test_error_ret(status,"finish",status);
+        status = clSetKernelArg(writer, 0, sizeof(cl_uchar), uchar_data);
+        test_error_ret(status, "set arg", status);
+        status = clSetKernelArg(writer, 1, sizeof(cl_uint), uint_data);
+        test_error_ret(status, "set arg", status);
+        status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0);
+        test_error_ret(status, "enqueue writer", status);
+        status = clFinish(queue);
+        test_error_ret(status, "finish", status);
-    if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
+    if (CL_SUCCESS == err)
+    {
+        log_info("OK\n");
+        FLUSH;
+    }
     return err;
@@ -1539,7 +1964,8 @@
 // Test support for variables at program scope. Miscellaneous
-int test_progvar_prog_scope_misc(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_progvar_prog_scope_misc(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements)
     cl_bool skip{ CL_FALSE };
     auto error = should_skip(device, skip);
@@ -1558,19 +1984,20 @@
     cl_int err = CL_SUCCESS;
-    err = l_get_device_info( device, &max_size, &pref_size );
-    err |= l_build_type_table( device );
+    err = l_get_device_info(device, &max_size, &pref_size);
+    err |= l_build_type_table(device);
-    err |= l_capacity( device, context, queue, max_size );
-    err |= l_user_type( device, context, queue, false );
-    err |= l_user_type( device, context, queue, true );
+    err |= l_capacity(device, context, queue, max_size);
+    err |= l_user_type(device, context, queue, false);
+    err |= l_user_type(device, context, queue, true);
     return err;
 // Test support for variables at program scope. Unitialized data
-int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements)
     cl_bool skip{ CL_FALSE };
     auto error = should_skip(device, skip);
@@ -1590,16 +2017,17 @@
     cl_int err = CL_SUCCESS;
-    err = l_get_device_info( device, &max_size, &pref_size );
-    err |= l_build_type_table( device );
+    err = l_get_device_info(device, &max_size, &pref_size);
+    err |= l_build_type_table(device);
-    err |= l_write_read( device, context, queue );
+    err |= l_write_read(device, context, queue);
     return err;
 // Test support for variables at program scope. Initialized data.
-int test_progvar_prog_scope_init(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_progvar_prog_scope_init(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements)
     cl_bool skip{ CL_FALSE };
     auto error = should_skip(device, skip);
@@ -1618,17 +2046,18 @@
     cl_int err = CL_SUCCESS;
-    err = l_get_device_info( device, &max_size, &pref_size );
-    err |= l_build_type_table( device );
+    err = l_get_device_info(device, &max_size, &pref_size);
+    err |= l_build_type_table(device);
-    err |= l_init_write_read( device, context, queue );
+    err |= l_init_write_read(device, context, queue);
     return err;
 // A simple test for support of static variables inside a kernel.
-int test_progvar_func_scope(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_progvar_func_scope(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements)
     cl_bool skip{ CL_FALSE };
     auto error = should_skip(device, skip);
@@ -1642,56 +2071,70 @@
                  "supported on this device\n");
         return TEST_SKIPPED_ITSELF;
-    size_t max_size = 0;
-    size_t pref_size = 0;
     cl_int err = CL_SUCCESS;
     // Deliberately have two variables with the same name but in different
     // scopes.
     // Also, use a large initialized structure in both cases.
+    // clang-format off
     const char prog_src[] =
         "typedef struct { char c; int16 i; } mystruct_t;\n"
-        "kernel void test_bump( global int* value, int which ) {\n"
-        "  if ( which ) {\n"
+        "kernel void test_bump(global int* value, int which) {\n"
+        "  if (which) {\n"
         // Explicit address space.
         // Last element set to 0
-        "     static global mystruct_t persistent = {'a',(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0) };\n"
+        "     static global mystruct_t persistent = { 'a', (int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0) };\n"
         "     *value = persistent.i.sf++;\n"
         "  } else {\n"
         // Implicitly global
         // Last element set to 100
-        "     static mystruct_t persistent = {'b',(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,100) };\n"
+        "     static mystruct_t persistent = { 'b' , (int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,100) };\n"
         "     *value = persistent.i.sf++;\n"
         "  }\n"
+    // clang-format on
     StringTable ksrc;
-    ksrc.add( prog_src );
+    ksrc.add(prog_src);
     int status = CL_SUCCESS;
     clProgramWrapper program;
     clKernelWrapper test_bump;
-    status = create_single_kernel_helper_with_build_options(context, &program, &test_bump, ksrc.num_str(), ksrc.strs(), "test_bump", OPTIONS);
-    test_error_ret(status, "Failed to create program for function static variable test", status);
+    status = create_single_kernel_helper_with_build_options(
+        context, &program, &test_bump, ksrc.num_str(), ksrc.strs(), "test_bump",
+        OPTIONS);
+    test_error_ret(status,
+                   "Failed to create program for function static variable test",
+                   status);
     // Check size query.
     size_t used_bytes = 0;
-    status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 );
-    test_error_ret(status,"Failed to query global variable total size",status);
+    status = clGetProgramBuildInfo(program, device,
+                                   CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE,
+                                   sizeof(used_bytes), &used_bytes, 0);
+    test_error_ret(status, "Failed to query global variable total size",
+                   status);
     size_t expected_size = 2 * sizeof(cl_int); // Two ints.
-    if ( used_bytes < expected_size ) {
-        log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_size, (unsigned long long)used_bytes );
+    if (used_bytes < expected_size)
+    {
+        log_error("Error: program query for global variable total size query "
+                  "failed: Expected at least %llu but got %llu\n",
+                  (unsigned long long)expected_size,
+                  (unsigned long long)used_bytes);
         err |= 1;
     // Prepare the data.
     cl_int counter_value = 0;
-    clMemWrapper counter_value_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(counter_value), &counter_value, &status ) );
-    test_error_ret(status,"Failed to allocate counter query buffer",status);
+    clMemWrapper counter_value_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                                  sizeof(counter_value),
+                                                  &counter_value, &status));
+    test_error_ret(status, "Failed to allocate counter query buffer", status);
-    status = clSetKernelArg(test_bump,0,sizeof(cl_mem),&counter_value_mem); test_error_ret(status,"set arg",status);
+    status = clSetKernelArg(test_bump, 0, sizeof(cl_mem), &counter_value_mem);
+    test_error_ret(status, "set arg", status);
     // Go a few rounds, alternating between the two counters in the kernel.
@@ -1701,26 +2144,41 @@
     cl_int expected_counter[2] = { 100, 0 };
     const size_t one = 1;
-    for ( int iround = 0; iround < 5 ; iround++ ) { // Must go at least twice around
-        for ( int iwhich = 0; iwhich < 2 ; iwhich++ ) { // Cover both counters
-            status = clSetKernelArg(test_bump,1,sizeof(iwhich),&iwhich); test_error_ret(status,"set arg",status);
-            status = clEnqueueNDRangeKernel(queue,test_bump,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue test_bump",status);
-            status = clFinish(queue); test_error_ret(status,"finish",status);
+    for (int iround = 0; iround < 5; iround++)
+    { // Must go at least twice around
+        for (int iwhich = 0; iwhich < 2; iwhich++)
+        { // Cover both counters
+            status = clSetKernelArg(test_bump, 1, sizeof(iwhich), &iwhich);
+            test_error_ret(status, "set arg", status);
+            status = clEnqueueNDRangeKernel(queue, test_bump, 1, 0, &one, 0, 0,
+                                            0, 0);
+            test_error_ret(status, "enqueue test_bump", status);
+            status = clFinish(queue);
+            test_error_ret(status, "finish", status);
-            cl_uchar *counter_value_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, counter_value_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(counter_value), 0, 0, 0, 0);
+            cl_uchar* counter_value_ptr = (cl_uchar*)clEnqueueMapBuffer(
+                queue, counter_value_mem, CL_TRUE, CL_MAP_READ, 0,
+                sizeof(counter_value), 0, 0, 0, 0);
-            if ( counter_value != expected_counter[iwhich] ) {
-                log_error("Error: Round %d on counter %d: Expected %d but got %d\n",
-                        iround, iwhich, expected_counter[iwhich], counter_value );
+            if (counter_value != expected_counter[iwhich])
+            {
+                log_error(
+                    "Error: Round %d on counter %d: Expected %d but got %d\n",
+                    iround, iwhich, expected_counter[iwhich], counter_value);
                 err |= 1;
             expected_counter[iwhich]++; // Emulate behaviour of the kernel.
-            clEnqueueUnmapMemObject(queue, counter_value_mem, counter_value_ptr, 0, 0, 0);
+            clEnqueueUnmapMemObject(queue, counter_value_mem, counter_value_ptr,
+                                    0, 0, 0);
-    if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
+    if (CL_SUCCESS == err)
+    {
+        log_info("OK\n");
+        FLUSH;
+    }
     return err;
diff --git a/test_conformance/basic/test_queue_priority.cpp b/test_conformance/basic/test_queue_priority.cpp
index 57ce504..ff6283c 100644
--- a/test_conformance/basic/test_queue_priority.cpp
+++ b/test_conformance/basic/test_queue_priority.cpp
@@ -48,13 +48,9 @@
 "    dst[tid] = srcA[tid] * srcB[tid];\n"
-static const float    MAX_ERR = 1e-5f;
 static int
 verify_fpadd(float *inptrA, float *inptrB, float *outptr, int n, int fileNum)
-    float       r;
     int         i;
     float * reference_ptr = (float *)malloc(n * sizeof(float));
@@ -82,7 +78,6 @@
 static int
 verify_fpsub(float *inptrA, float *inptrB, float *outptr, int n, int fileNum)
-    float       r;
     int         i;
     float * reference_ptr = (float *)malloc(n * sizeof(float));
@@ -110,7 +105,6 @@
 static int
 verify_fpmul(float *inptrA, float *inptrB, float *outptr, int n, int fileNum)
-    float       r;
     int         i;
     float * reference_ptr = (float *)malloc(n * sizeof(float));
diff --git a/test_conformance/basic/test_readimage3d.cpp b/test_conformance/basic/test_readimage3d.cpp
index 1337c9f..5fd7d10 100644
--- a/test_conformance/basic/test_readimage3d.cpp
+++ b/test_conformance/basic/test_readimage3d.cpp
@@ -142,7 +142,7 @@
 	int img_width = 64;
 	int img_height = 64;
 	int img_depth = 64;
-	int i, err;
+	int err;
 	size_t origin[3] = {0, 0, 0};
 	size_t region[3] = {img_width, img_height, img_depth};
 	size_t length = img_width * img_height * img_depth * 4 * sizeof(float);
diff --git a/test_conformance/basic/test_simple_image_pitch.cpp b/test_conformance/basic/test_simple_image_pitch.cpp
index 1cd82b6..2eb43b3 100644
--- a/test_conformance/basic/test_simple_image_pitch.cpp
+++ b/test_conformance/basic/test_simple_image_pitch.cpp
@@ -83,7 +83,7 @@
-  return CL_SUCCESS;
+  return errors == 0 ? TEST_PASS : TEST_FAIL;
 int test_simple_write_image_pitch(cl_device_id device, cl_context cl_context_, cl_command_queue q, int num_elements)
@@ -149,5 +149,5 @@
-  return CL_SUCCESS;
+  return errors == 0 ? TEST_PASS : TEST_FAIL;
diff --git a/test_conformance/basic/test_sizeof.cpp b/test_conformance/basic/test_sizeof.cpp
index 66a6c56..e980ed6 100644
--- a/test_conformance/basic/test_sizeof.cpp
+++ b/test_conformance/basic/test_sizeof.cpp
@@ -35,9 +35,9 @@
-    cl_program  p;
-    cl_kernel   k;
-    cl_mem      m;
+    clProgramWrapper p;
+    clKernelWrapper k;
+    clMemWrapper m;
     cl_uint        temp;
@@ -51,42 +51,19 @@
     cl_int err = create_single_kernel_helper_with_build_options(
         context, &p, &k, 4, sizeof_kernel_code, "test_sizeof", nullptr);
-    if( err )
-        return err;
+    test_error(err, "Failed to build kernel/program.");
     m = clCreateBuffer( context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof( cl_ulong ), size, &err );
-    if( NULL == m )
-    {
-        clReleaseProgram( p );
-        clReleaseKernel( k );
-        log_error("\nclCreateBuffer FAILED\n");
-        return err;
-    }
+    test_error(err, "clCreateBuffer failed.");
     err = clSetKernelArg( k, 0, sizeof( cl_mem ), &m );
-    if( err )
-    {
-        clReleaseProgram( p );
-        clReleaseKernel( k );
-        clReleaseMemObject( m );
-        log_error("\nclSetKernelArg FAILED\n");
-        return err;
-    }
+    test_error(err, "clSetKernelArg failed.");
     err = clEnqueueTask( queue, k, 0, NULL, NULL );
-    clReleaseProgram( p );
-    clReleaseKernel( k );
-    if( err )
-    {
-        clReleaseMemObject( m );
-        log_error( "\nclEnqueueTask FAILED\n" );
-        return err;
-    }
+    test_error(err, "clEnqueueTask failed.");
     err = clEnqueueReadBuffer( queue, m, CL_TRUE, 0, sizeof( cl_uint ), &temp, 0, NULL, NULL );
-    clReleaseMemObject( m );
-    if( err )
-        log_error( "\nclEnqueueReadBuffer FAILED\n" );
+    test_error(err, "clEnqueueReadBuffer failed.");
     *size = (cl_ulong) temp;
@@ -292,11 +269,11 @@
-        if( gIsEmbedded &&
-           0 == strcmp(other_types[i], "image3d_t") &&
-           checkFor3DImageSupport( device ) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
+        if (0 == strcmp(other_types[i], "image3d_t")
+            && checkFor3DImageSupport(device) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
-            log_info("\n3D images are not supported by this device. Skipping test.\t");
+            log_info("\n3D images are not supported by this device. "
+                     "Skipping test.\t");
diff --git a/test_conformance/basic/test_vector_swizzle.cpp b/test_conformance/basic/test_vector_swizzle.cpp
index 5ab3ea4..884bcf3 100644
--- a/test_conformance/basic/test_vector_swizzle.cpp
+++ b/test_conformance/basic/test_vector_swizzle.cpp
@@ -610,9 +610,6 @@
     cl_int error = CL_SUCCESS;
     int result = TEST_PASS;
-    clProgramWrapper program;
-    clKernelWrapper kernel;
     std::string buildOptions{ "-DTYPE=" };
     buildOptions += type_name;
     buildOptions += std::to_string(N);
@@ -628,37 +625,52 @@
     makeReference<T, N, S>(reference);
     // XYZW swizzles:
-    const char* xyzw_source = TestInfo<N>::kernel_source_xyzw;
-    error = create_single_kernel_helper(
-        context, &program, &kernel, 1, &xyzw_source, "test_vector_swizzle_xyzw",
-        buildOptions.c_str());
-    test_error(error, "Unable to create xyzw test kernel");
-    result |= test_vectype_case(value, reference, context, kernel, queue);
-    // sN swizzles:
-    const char* sN_source = TestInfo<N>::kernel_source_sN;
-    error = create_single_kernel_helper(context, &program, &kernel, 1,
-                                        &sN_source, "test_vector_swizzle_sN",
-                                        buildOptions.c_str());
-    test_error(error, "Unable to create sN test kernel");
-    result |= test_vectype_case(value, reference, context, kernel, queue);
-    // RGBA swizzles for OpenCL 3.0 and newer:
-    const Version device_version = get_device_cl_version(device);
-    if (device_version >= Version(3, 0))
-        const char* rgba_source = TestInfo<N>::kernel_source_rgba;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        const char* xyzw_source = TestInfo<N>::kernel_source_xyzw;
         error = create_single_kernel_helper(
-            context, &program, &kernel, 1, &rgba_source,
-            "test_vector_swizzle_rgba", buildOptions.c_str());
-        test_error(error, "Unable to create rgba test kernel");
+            context, &program, &kernel, 1, &xyzw_source,
+            "test_vector_swizzle_xyzw", buildOptions.c_str());
+        test_error(error, "Unable to create xyzw test kernel");
         result |= test_vectype_case(value, reference, context, kernel, queue);
+    // sN swizzles:
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        const char* sN_source = TestInfo<N>::kernel_source_sN;
+        error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &sN_source, "test_vector_swizzle_sN",
+            buildOptions.c_str());
+        test_error(error, "Unable to create sN test kernel");
+        result |= test_vectype_case(value, reference, context, kernel, queue);
+    }
+    // RGBA swizzles for OpenCL 3.0 and newer:
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        const Version device_version = get_device_cl_version(device);
+        if (device_version >= Version(3, 0))
+        {
+            const char* rgba_source = TestInfo<N>::kernel_source_rgba;
+            error = create_single_kernel_helper(
+                context, &program, &kernel, 1, &rgba_source,
+                "test_vector_swizzle_rgba", buildOptions.c_str());
+            test_error(error, "Unable to create rgba test kernel");
+            result |=
+                test_vectype_case(value, reference, context, kernel, queue);
+        }
+    }
     return result;
diff --git a/test_conformance/basic/test_writeimage_fp32.cpp b/test_conformance/basic/test_writeimage_fp32.cpp
index fef7187..c68463a 100644
--- a/test_conformance/basic/test_writeimage_fp32.cpp
+++ b/test_conformance/basic/test_writeimage_fp32.cpp
@@ -122,9 +122,10 @@
         return -1;
-  err = create_single_kernel_helper(context, &program, &kernel[0], 1, &rgbaFFFF_write_kernel_code, "test_rgbaFFFF_write" );
-  if (err)
-    return -1;
+    err = create_single_kernel_helper(context, &program, &kernel[0], 1,
+                                      &rgbaFFFF_write_kernel_code,
+                                      "test_rgbaFFFF_write");
+    if (err) return -1;
     kernel[1] = clCreateKernel(program, "test_rgbaFFFF_write", NULL);
     if (!kernel[1])
diff --git a/test_conformance/basic/test_writeimage_int16.cpp b/test_conformance/basic/test_writeimage_int16.cpp
index 8afb77a..d863a3a 100644
--- a/test_conformance/basic/test_writeimage_int16.cpp
+++ b/test_conformance/basic/test_writeimage_int16.cpp
@@ -128,9 +128,10 @@
         return -1;
-  err = create_single_kernel_helper(context, &program, &kernel[0], 1, &rgba16_write_kernel_code, "test_rgba16_write" );
-  if (err)
-    return -1;
+    err = create_single_kernel_helper(context, &program, &kernel[0], 1,
+                                      &rgba16_write_kernel_code,
+                                      "test_rgba16_write");
+    if (err) return -1;
     kernel[1] = clCreateKernel(program, "test_rgba16_write", NULL);
     if (!kernel[1])
diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp
index 9c9c7d1..9207979 100644
--- a/test_conformance/buffers/test_buffer_fill.cpp
+++ b/test_conformance/buffers/test_buffer_fill.cpp
@@ -703,8 +703,6 @@
 int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
     TestStruct pattern;
-    clProgramWrapper program;
-    clKernelWrapper kernel;
     size_t      ptrSize = sizeof( TestStruct );
     size_t      global_work_size[3];
     int         n, err;
@@ -720,6 +718,8 @@
     for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        clProgramWrapper program;
+        clKernelWrapper kernel;
         log_info("Testing with cl_mem_flags: %s\n",
diff --git a/test_conformance/buffers/test_buffer_migrate.cpp b/test_conformance/buffers/test_buffer_migrate.cpp
index f309836..6cdc271 100644
--- a/test_conformance/buffers/test_buffer_migrate.cpp
+++ b/test_conformance/buffers/test_buffer_migrate.cpp
@@ -80,7 +80,7 @@
 static cl_int restoreBuffer(cl_command_queue *queues, cl_mem *buffers, cl_uint num_devices, cl_mem_migration_flags *flags, cl_uint *buffer)
-  cl_uint i, j;
+  cl_uint i;
   cl_int  err;
   // If the buffer was previously migrated with undefined content, reload the content.
diff --git a/test_conformance/buffers/test_buffer_read.cpp b/test_conformance/buffers/test_buffer_read.cpp
index 39cf329..49a57f9 100644
--- a/test_conformance/buffers/test_buffer_read.cpp
+++ b/test_conformance/buffers/test_buffer_read.cpp
@@ -763,7 +763,6 @@
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
-    clEventWrapper event;
     void        *outptr[5];
     void        *inptr[5];
     size_t      global_work_size[3];
@@ -805,6 +804,7 @@
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
             clMemWrapper buffer;
+            clEventWrapper event;
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
@@ -900,7 +900,6 @@
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
-    clEventWrapper event;
     void        *outptr[5], *inptr[5];
     size_t      global_work_size[3];
     cl_int      err;
@@ -941,6 +940,7 @@
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
             clMemWrapper buffer;
+            clEventWrapper event;
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
diff --git a/test_conformance/buffers/test_image_migrate.cpp b/test_conformance/buffers/test_image_migrate.cpp
index dbdca9c..6c8acdc 100644
--- a/test_conformance/buffers/test_image_migrate.cpp
+++ b/test_conformance/buffers/test_image_migrate.cpp
@@ -128,7 +128,6 @@
   cl_mem_migration_flags *flagsA, *flagsB, *flagsC;
   cl_device_partition_property property[] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0, 0};
   cl_mem *imageA, *imageB, *imageC;
-  cl_mem_flags flags;
   cl_image_format format;
   cl_sampler sampler = NULL;
   cl_program program = NULL;
diff --git a/test_conformance/buffers/test_sub_buffers.cpp b/test_conformance/buffers/test_sub_buffers.cpp
index 3e50121..d6ab111 100644
--- a/test_conformance/buffers/test_sub_buffers.cpp
+++ b/test_conformance/buffers/test_sub_buffers.cpp
@@ -15,6 +15,8 @@
 #include "procs.h"
+#include <algorithm>
 // Design:
 // To test sub buffers, we first create one main buffer. We then create several sub-buffers and
 // queue Actions on each one. Each Action is encapsulated in a class so it can keep track of
@@ -39,7 +41,8 @@
         region.size = mSize;
         cl_int error;
-        mMem = clCreateSubBuffer( mParentBuffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+        reset(clCreateSubBuffer(mParentBuffer, flags,
+                                CL_BUFFER_CREATE_TYPE_REGION, &region, &error));
         return error;
@@ -100,13 +103,6 @@
-#ifndef MAX
-#define MAX( _a, _b )   ( (_a) > (_b) ? (_a) : (_b) )
-#ifndef MIN
-#define MIN( _a, _b )   ( (_a) < (_b) ? (_a) : (_b) )
 class CopyAction : public Action
@@ -116,7 +112,8 @@
     virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState )
         // Copy from sub-buffer 1 to sub-buffer 2
-        size_t size = get_random_size_t( 0, MIN( buffer1.mSize, buffer2.mSize ), GetRandSeed() );
+        size_t size = get_random_size_t(
+            0, std::min(buffer1.mSize, buffer2.mSize), GetRandSeed());
         size_t startOffset = get_random_size_t( 0, buffer1.mSize - size, GetRandSeed() );
         size_t endOffset = get_random_size_t( 0, buffer2.mSize - size, GetRandSeed() );
@@ -265,7 +262,11 @@
             endRange = mainSize;
         size_t offset = get_random_size_t( toStartFrom / addressAlign, endRange / addressAlign, Action::GetRandSeed() ) * addressAlign;
-        size_t size = get_random_size_t( 1, ( MIN( mainSize / 8, mainSize - offset ) ) / addressAlign, Action::GetRandSeed() ) * addressAlign;
+        size_t size =
+            get_random_size_t(
+                1, (std::min(mainSize / 8, mainSize - offset)) / addressAlign,
+                Action::GetRandSeed())
+            * addressAlign;
         error = subBuffers[ numSubBuffers ].Allocate( mainBuffer, CL_MEM_READ_WRITE, offset, size );
         test_error( error, "Unable to allocate sub buffer" );
@@ -442,7 +443,7 @@
     error = get_reasonable_buffer_size( otherDevice, maxBuffer2 );
     test_error( error, "Unable to get buffer size for secondary device" );
-    maxBuffer1 = MIN( maxBuffer1, maxBuffer2 );
+    maxBuffer1 = std::min(maxBuffer1, maxBuffer2);
     cl_uint addressAlign1Bits, addressAlign2Bits;
     error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign1Bits ), &addressAlign1Bits, NULL );
@@ -451,7 +452,7 @@
     error = clGetDeviceInfo( otherDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign2Bits ), &addressAlign2Bits, NULL );
     test_error( error, "Unable to get secondary device's address alignment" );
-    cl_uint addressAlign1 = MAX( addressAlign1Bits, addressAlign2Bits ) / 8;
+    cl_uint addressAlign1 = std::max(addressAlign1Bits, addressAlign2Bits) / 8;
     // Finally time to run!
     return test_sub_buffers_read_write_core( testingContext, queue1, queue2, maxBuffer1, addressAlign1 );
diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h
index bbcc68c..6c7d0b1 100644
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -28,10 +28,9 @@
 #define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
 #define MAX_HOST_THREADS GetThreadCount()
-#define EXECUTE_TEST(error, test)\
-  error |= test;\
-  if(error && !gContinueOnError)\
-  return error;
+#define EXECUTE_TEST(error, test)                                              \
+    error |= test;                                                             \
+    if (error && !gContinueOnError) return error;
 enum TExplicitAtomicType
@@ -57,764 +56,918 @@
-extern bool gHost; // temporary flag for testing native host threads (test verification)
+extern bool
+    gHost; // temporary flag for testing native host threads (test verification)
 extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
 extern bool gContinueOnError; // execute all cases even when errors detected
-extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
+extern bool
+    gNoGlobalVariables; // disable cases with global atomics in program scope
 extern bool gNoGenericAddressSpace; // disable cases with generic address space
 extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
 extern bool gDebug; // print OpenCL kernel code
-extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
-extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device
+extern int gInternalIterations; // internal test iterations for atomic
+                                // operation, sufficient to verify atomicity
+extern int
+    gMaxDeviceThreads; // maximum number of threads executed on OCL device
 extern cl_device_atomic_capabilities gAtomicMemCap,
     gAtomicFenceCap; // atomic memory and fence capabilities for this device
-extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
-extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
+extern const char *
+get_memory_order_type_name(TExplicitMemoryOrderType orderType);
+extern const char *
+get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
 extern cl_int getSupportedMemoryOrdersAndScopes(
     cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
     std::vector<TExplicitMemoryScopeType> &memoryScopes);
-class AtomicTypeInfo
+class AtomicTypeInfo {
-  TExplicitAtomicType _type;
-  AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
-  cl_uint Size(cl_device_id device);
-  const char* AtomicTypeName();
-  const char* RegularTypeName();
-  const char* AddSubOperandTypeName();
-  int IsSupported(cl_device_id device);
+    TExplicitAtomicType _type;
+    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
+    cl_uint Size(cl_device_id device);
+    const char *AtomicTypeName();
+    const char *RegularTypeName();
+    const char *AddSubOperandTypeName();
+    int IsSupported(cl_device_id device);
-template<typename HostDataType>
-class AtomicTypeExtendedInfo : public AtomicTypeInfo
+template <typename HostDataType>
+class AtomicTypeExtendedInfo : public AtomicTypeInfo {
-  AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
-  HostDataType MinValue();
-  HostDataType MaxValue();
-  HostDataType SpecialValue(cl_uchar x)
-  {
-    HostDataType tmp;
-    cl_uchar *ptr = (cl_uchar*)&tmp;
-    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
-      ptr[i] = x;
-    return tmp;
-  }
-  HostDataType SpecialValue(cl_ushort x)
-  {
-    HostDataType tmp;
-    cl_ushort *ptr = (cl_ushort*)&tmp;
-    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
-      ptr[i] = x;
-    return tmp;
-  }
+    AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {}
+    HostDataType MinValue();
+    HostDataType MaxValue();
+    HostDataType SpecialValue(cl_uchar x)
+    {
+        HostDataType tmp;
+        cl_uchar *ptr = (cl_uchar *)&tmp;
+        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++)
+            ptr[i] = x;
+        return tmp;
+    }
+    HostDataType SpecialValue(cl_ushort x)
+    {
+        HostDataType tmp;
+        cl_ushort *ptr = (cl_ushort *)&tmp;
+        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++)
+            ptr[i] = x;
+        return tmp;
+    }
-class CTest  {
+class CTest {
-  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
+    virtual int Execute(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements) = 0;
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTest : CTest
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTest : CTest {
-  typedef struct {
-    CBasicTest *test;
-    cl_uint tid;
-    cl_uint threadCount;
-    volatile HostAtomicType *destMemory;
-    HostDataType *oldValues;
-  } THostThreadContext;
-  static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
-  {
-    THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
-    threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
-    return 0;
-  }
-  CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
-    _maxDeviceThreads(MAX_DEVICE_THREADS),
-    _dataType(dataType), _useSVM(useSVM), _startValue(255),
-    _localMemory(false), _declaredInProgram(false),
-    _usedInFunction(false), _genericAddrSpace(false),
-    _oldValueCheck(true), _localRefValues(false),
-    _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
-  {
-  }
-  virtual ~CBasicTest()
-  {
-    if(_passCount)
-      log_info("  %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return 1;
-  }
-  virtual cl_uint NumNonAtomicVariablesPerThread()
-  {
-    return 1;
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    return false;
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    return false;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    return false;
-  }
-  virtual std::string PragmaHeader(cl_device_id deviceID);
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems);
-  virtual std::string FunctionCode();
-  virtual std::string KernelCode(cl_uint maxNumDestItems);
-  virtual std::string ProgramCore() = 0;
-  virtual std::string SingleTestName()
-  {
-    std::string testName = LocalMemory() ? "local" : "global";
-    testName += " ";
-    testName += DataType().AtomicTypeName();
-    if(DeclaredInProgram())
+    typedef struct
-      testName += " declared in program";
-    }
-    if(DeclaredInProgram() && UsedInFunction())
-      testName += ",";
-    if(UsedInFunction())
+        CBasicTest *test;
+        cl_uint tid;
+        cl_uint threadCount;
+        volatile HostAtomicType *destMemory;
+        HostDataType *oldValues;
+    } THostThreadContext;
+    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id,
+                                     void *userInfo)
-      testName += " used in ";
-      if(GenericAddrSpace())
-        testName += "generic ";
-      testName += "function";
-    }
-    return testName;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
-  int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    UsedInFunction(false);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    UsedInFunction(true);
-    GenericAddrSpace(false);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    GenericAddrSpace(true);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    GenericAddrSpace(false);
-    return error;
-  }
-  int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    DeclaredInProgram(false);
-    EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
-    if(!UseSVM())
-    {
-      DeclaredInProgram(true);
-      EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
-    }
-    return error;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    if(_maxDeviceThreads > 0 && !UseSVM())
-    {
-      LocalMemory(true);
-      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
-    }
-    if(_maxDeviceThreads+MaxHostThreads() > 0)
-    {
-      LocalMemory(false);
-      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
-    }
-    return error;
-  }
-  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  {
-    if(sizeof(HostAtomicType) != DataType().Size(deviceID))
-    {
-      log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
-      return -1;
-    }
-    if(sizeof(HostAtomicType) != sizeof(HostDataType))
-    {
-      log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
-      return -1;
-    }
-    // Verify we can run first
-    if(UseSVM() && !gUseHostPtr)
-    {
-      cl_device_svm_capabilities caps;
-      cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
-      test_error(error, "clGetDeviceInfo failed");
-      if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
-      {
-        log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
-        // implicit pass
+        THostThreadContext *threadContext =
+            ((THostThreadContext *)userInfo) + job_id;
+        threadContext->test->HostFunction(
+            threadContext->tid, threadContext->threadCount,
+            threadContext->destMemory, threadContext->oldValues);
         return 0;
-      }
-    if(!DataType().IsSupported(deviceID))
+    CBasicTest(TExplicitAtomicType dataType, bool useSVM)
+        : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType),
+          _useSVM(useSVM), _startValue(255), _localMemory(false),
+          _declaredInProgram(false), _usedInFunction(false),
+          _genericAddrSpace(false), _oldValueCheck(true),
+          _localRefValues(false), _maxGroupSize(0), _passCount(0),
+          _iterations(gInternalIterations)
+    {}
+    virtual ~CBasicTest()
-      log_info("\t%s not supported\n", DataType().AtomicTypeName());
-      // implicit pass or host test (debug feature)
-      if(UseSVM())
-        return 0;
-      _maxDeviceThreads = 0;
+        if (_passCount)
+            log_info("  %u tests executed successfully for %s\n", _passCount,
+                     DataType().AtomicTypeName());
-    if(_maxDeviceThreads+MaxHostThreads() == 0)
-      return 0;
-    return ExecuteForEachParameterSet(deviceID, context, queue);
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    log_info("Empty thread function %u\n", (cl_uint)tid);
-  }
-  AtomicTypeExtendedInfo<HostDataType> DataType() const
-  {
-    return AtomicTypeExtendedInfo<HostDataType>(_dataType);
-  }
-  cl_uint _maxDeviceThreads;
-  virtual cl_uint MaxHostThreads()
-  {
-    if(UseSVM() || gHost)
-      return MAX_HOST_THREADS;
-    else
-      return 0;
-  }
-  int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
-                        TExplicitMemoryOrderType memoryOrder)
-  {
-      /*
-          Differentiation between atomic fence and other atomic operations
-          does not need to occur here.
-          The initialisation of this test checks that the minimum required
-          capabilities are supported by this device.
-          The following switches allow the test to skip if optional capabilites
-          are not supported by the device.
-        */
-      switch (memoryScope)
-      {
-          case MEMORY_SCOPE_EMPTY: {
-              break;
-          }
-          case MEMORY_SCOPE_WORK_GROUP: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_SCOPE_DEVICE: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_SCOPE_ALL_DEVICES: // fallthough
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          default: {
-              log_info("Invalid memory scope\n");
-              break;
-          }
-      }
-      switch (memoryOrder)
-      {
-          case MEMORY_ORDER_EMPTY: {
-              break;
-          }
-          case MEMORY_ORDER_RELAXED: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_ORDER_ACQUIRE:
-          case MEMORY_ORDER_RELEASE:
-          case MEMORY_ORDER_ACQ_REL: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_ORDER_SEQ_CST: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          default: {
-              log_info("Invalid memory order\n");
-              break;
-          }
-      }
-      return 0;
-  }
-  virtual bool SVMDataBufferAllSVMConsistent() {return false;}
-  bool UseSVM() {return _useSVM;}
-  void StartValue(HostDataType startValue) {_startValue = startValue;}
-  HostDataType StartValue() {return _startValue;}
-  void LocalMemory(bool local) {_localMemory = local;}
-  bool LocalMemory() {return _localMemory;}
-  void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
-  bool DeclaredInProgram() {return _declaredInProgram;}
-  void UsedInFunction(bool local) {_usedInFunction = local;}
-  bool UsedInFunction() {return _usedInFunction;}
-  void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
-  bool GenericAddrSpace() {return _genericAddrSpace;}
-  void OldValueCheck(bool check) {_oldValueCheck = check;}
-  bool OldValueCheck() {return _oldValueCheck;}
-  void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
-  bool LocalRefValues() {return _localRefValues;}
-  void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
-  cl_uint MaxGroupSize() {return _maxGroupSize;}
-  void CurrentGroupSize(cl_uint currentGroupSize)
-  {
-    if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
-      _currentGroupSize = MaxGroupSize();
-    else
-      _currentGroupSize = currentGroupSize;
-  }
-  cl_uint CurrentGroupSize() {return _currentGroupSize;}
-  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
-  {
-    if(threadCount == 0)
-      return 0;
-    if(LocalMemory())
-      return 1;
-    return threadCount/CurrentGroupSize();
-  }
-  cl_int Iterations() {return _iterations;}
-  std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
-  const TExplicitAtomicType _dataType;
-  const bool _useSVM;
-  HostDataType	_startValue;
-  bool _localMemory;
-  bool _declaredInProgram;
-  bool _usedInFunction;
-  bool _genericAddrSpace;
-  bool _oldValueCheck;
-  bool _localRefValues;
-  cl_uint _maxGroupSize;
-  cl_uint _currentGroupSize;
-  cl_uint _passCount;
-  const cl_int _iterations;
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
-  using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems)
-  {
-    std::string header;
-    if(gOldAPI)
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-      std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
-      header +=
-        "#define atomic_store_explicit(x,y,o"+s+")                     atomic_store(x,y)\n"
-        "#define atomic_load_explicit(x,o"+s+")                        atomic_load(x)\n"
-        "#define atomic_exchange_explicit(x,y,o"+s+")                  atomic_exchange(x,y)\n"
-        "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
-        "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+")   atomic_compare_exchange_weak(x,y,z)\n"
-        "#define atomic_fetch_add_explicit(x,y,o"+s+")                 atomic_fetch_add(x,y)\n"
-        "#define atomic_fetch_sub_explicit(x,y,o"+s+")                 atomic_fetch_sub(x,y)\n"
-        "#define atomic_fetch_or_explicit(x,y,o"+s+")                  atomic_fetch_or(x,y)\n"
-        "#define atomic_fetch_xor_explicit(x,y,o"+s+")                 atomic_fetch_xor(x,y)\n"
-        "#define atomic_fetch_and_explicit(x,y,o"+s+")                 atomic_fetch_and(x,y)\n"
-        "#define atomic_fetch_min_explicit(x,y,o"+s+")                 atomic_fetch_min(x,y)\n"
-        "#define atomic_fetch_max_explicit(x,y,o"+s+")                 atomic_fetch_max(x,y)\n"
-        "#define atomic_flag_test_and_set_explicit(x,o"+s+")           atomic_flag_test_and_set(x)\n"
-        "#define atomic_flag_clear_explicit(x,o"+s+")                  atomic_flag_clear(x)\n";
+        return 1;
-    return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
+    virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
+        return false;
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
-      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
+        return false;
-    return testName;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(LocalMemory() &&
-      MemoryScope() != MEMORY_SCOPE_EMPTY &&
-      MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory
-      return 0;
-    if(MemoryScope() == MEMORY_SCOPE_DEVICE)
-      MaxGroupSize(16); // increase number of groups by forcing smaller group size
-    else
-      MaxGroupSize(0); // group size limited by device capabilities
-    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-    return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    // repeat test for each reasonable memory order/scope combination
-    std::vector<TExplicitMemoryOrderType> memoryOrder;
-    std::vector<TExplicitMemoryScopeType> memoryScope;
-    int error = 0;
-    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
-    // we query for the supported ones.
-    test_error_ret(
-        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
-        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
-    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
-      for(unsigned si = 0; si < memoryScope.size(); si++)
-      {
-        if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
-          continue;
-        MemoryOrder(memoryOrder[oi]);
-        MemoryScope(memoryScope[si]);
-        EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
-      }
+        return false;
-    return error;
-  }
-  void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
-  TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
-  std::string MemoryOrderStr()
-  {
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-      return std::string(", ")+get_memory_order_type_name(MemoryOrder());
-    return "";
-  }
-  void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
-  TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
-  std::string MemoryScopeStr()
-  {
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-      return std::string(", ")+get_memory_scope_type_name(MemoryScope());
-    return "";
-  }
-  std::string MemoryOrderScopeStr()
-  {
-    return MemoryOrderStr()+MemoryScopeStr();
-  }
-  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
-  {
-    if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
-      return 1;
-    return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
-  }
-  virtual cl_uint MaxHostThreads()
-  {
-      // block host threads execution for memory scope different than
-      // memory_scope_all_svm_devices
-      if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
-          || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
-      {
-          return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
-      }
-      else
-      {
-          return 0;
-      }
-  }
-  TExplicitMemoryOrderType _memoryOrder;
-  TExplicitMemoryScopeType _memoryScope;
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
-    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
-    return testName;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    // repeat test for each reasonable memory order/scope combination
-    std::vector<TExplicitMemoryOrderType> memoryOrder;
-    std::vector<TExplicitMemoryScopeType> memoryScope;
-    int error = 0;
-    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
-    // we query for the supported ones.
-    test_error_ret(
-        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
-        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
-    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
+    virtual std::string PragmaHeader(cl_device_id deviceID);
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
+    virtual std::string FunctionCode();
+    virtual std::string KernelCode(cl_uint maxNumDestItems);
+    virtual std::string ProgramCore() = 0;
+    virtual std::string SingleTestName()
-      for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
-      {
-        for(unsigned si = 0; si < memoryScope.size(); si++)
+        std::string testName = LocalMemory() ? "local" : "global";
+        testName += " ";
+        testName += DataType().AtomicTypeName();
+        if (DeclaredInProgram())
-          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
-            && memoryOrder[oi] != memoryOrder[o2i])
-            continue; // both memory order arguments must be set (or none)
-          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
-            && memoryScope[si] != MEMORY_SCOPE_EMPTY)
-            continue; // memory scope without memory order is not allowed
-          MemoryOrder(memoryOrder[oi]);
-          MemoryOrder2(memoryOrder[o2i]);
-          MemoryScope(memoryScope[si]);
-          if (CheckCapabilities(MemoryScope(), MemoryOrder())
-              == TEST_SKIPPED_ITSELF)
-              continue; // skip test - not applicable
-          if (CheckCapabilities(MemoryScope(), MemoryOrder2())
-              == TEST_SKIPPED_ITSELF)
-              continue; // skip test - not applicable
-          EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
+            testName += " declared in program";
-      }
+        if (DeclaredInProgram() && UsedInFunction()) testName += ",";
+        if (UsedInFunction())
+        {
+            testName += " used in ";
+            if (GenericAddrSpace()) testName += "generic ";
+            testName += "function";
+        }
+        return testName;
-    return error;
-  }
-  void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
-  TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
-  std::string MemoryOrderFailStr()
-  {
-    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
-      return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
-    return "";
-  }
-  std::string MemoryOrderScope()
-  {
-    return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
-  }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue);
+    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        int error = 0;
+        UsedInFunction(false);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        UsedInFunction(true);
+        GenericAddrSpace(false);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        GenericAddrSpace(true);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        GenericAddrSpace(false);
+        return error;
+    }
+    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue)
+    {
+        int error = 0;
+        DeclaredInProgram(false);
+        EXECUTE_TEST(error,
+                     ExecuteForEachPointerType(deviceID, context, queue));
+        if (!UseSVM())
+        {
+            DeclaredInProgram(true);
+            EXECUTE_TEST(error,
+                         ExecuteForEachPointerType(deviceID, context, queue));
+        }
+        return error;
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    { // Runs the declaration sweep with LocalMemory on, then off.
+        int error = 0;
+        if (_maxDeviceThreads > 0 && !UseSVM()) // local run: device, non-SVM
+        {
+            LocalMemory(true);
+            EXECUTE_TEST(
+                error, ExecuteForEachDeclarationType(deviceID, context, queue));
+        }
+        if (_maxDeviceThreads + MaxHostThreads() > 0) // any thread available
+        {
+            LocalMemory(false);
+            EXECUTE_TEST(
+                error, ExecuteForEachDeclarationType(deviceID, context, queue));
+        }
+        return error;
+    }
+    virtual int Execute(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+    { // Validates sizes and support, then runs every parameter set.
+        if (sizeof(HostAtomicType) != DataType().Size(deviceID))
+        {
+            log_info("Invalid test: Host atomic type size (%u) is different "
+                     "than OpenCL type size (%u)\n",
+                     (cl_uint)sizeof(HostAtomicType),
+                     DataType().Size(deviceID));
+            return -1; // host atomic type and device type differ in size
+        }
+        if (sizeof(HostAtomicType) != sizeof(HostDataType))
+        {
+            log_info("Invalid test: Host atomic type size (%u) is different "
+                     "than corresponding type size (%u)\n",
+                     (cl_uint)sizeof(HostAtomicType),
+                     (cl_uint)sizeof(HostDataType));
+            return -1; // host atomic type and host data type differ in size
+        }
+        // Verify we can run first
+        if (UseSVM() && !gUseHostPtr) // SVM path needs CL_DEVICE_SVM_ATOMICS
+        {
+            cl_device_svm_capabilities caps;
+            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES,
+                                           sizeof(caps), &caps, 0);
+            test_error(error, "clGetDeviceInfo failed"); // macro, defined elsewhere
+            if ((caps & CL_DEVICE_SVM_ATOMICS) == 0)
+            {
+                log_info("\t%s - SVM_ATOMICS not supported\n",
+                         DataType().AtomicTypeName());
+                // implicit pass
+                return 0;
+            }
+        }
+        if (!DataType().IsSupported(deviceID))
+        {
+            log_info("\t%s not supported\n", DataType().AtomicTypeName());
+            // implicit pass or host test (debug feature)
+            if (UseSVM()) return 0;
+            _maxDeviceThreads = 0; // non-SVM: only host threads can still run
+        }
+        if (_maxDeviceThreads + MaxHostThreads() == 0) return 0; // nothing to run
+        return ExecuteForEachParameterSet(deviceID, context, queue);
+    } // note: num_elements is not read anywhere in this body
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    { // Default body only logs tid; the other parameters are unused here.
+        log_info("Empty thread function %u\n", (cl_uint)tid);
+    }
+    AtomicTypeExtendedInfo<HostDataType> DataType() const
+    { // Returns a new AtomicTypeExtendedInfo built from _dataType.
+        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
+    }
+    cl_uint _maxDeviceThreads; // Execute() zeroes this for unsupported types
+    virtual cl_uint MaxHostThreads()
+    {
+        // Host threads take part only for SVM tests or when gHost is set;
+        // in every other case the host thread budget is zero.
+        const bool hostThreadsEnabled = UseSVM() || gHost;
+        return hostThreadsEnabled ? MAX_HOST_THREADS : 0;
+    }
+    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
+                          TExplicitMemoryOrderType memoryOrder)
+    { // TEST_SKIPPED_ITSELF if gAtomicMemCap lacks a needed bit, else 0.
+        /*
+            Differentiation between atomic fence and other atomic operations
+            does not need to occur here.
+            The initialisation of this test checks that the minimum required
+            capabilities are supported by this device.
+            The following switches allow the test to skip if optional
+            capabilities are not supported by the device.
+         */
+        switch (memoryScope) // the scope is checked first, then the order
+        {
+            case MEMORY_SCOPE_EMPTY: {
+                break;
+            }
+            case MEMORY_SCOPE_WORK_GROUP: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_SCOPE_DEVICE: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_SCOPE_ALL_DEVICES: // fallthrough
+            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            default: {
+                log_info("Invalid memory scope\n"); // logged only, no skip
+                break;
+            }
+        }
+        switch (memoryOrder)
+        {
+            case MEMORY_ORDER_EMPTY: {
+                break;
+            }
+            case MEMORY_ORDER_RELAXED: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_ORDER_ACQUIRE: // acquire, release and acq_rel all
+            case MEMORY_ORDER_RELEASE: // test the same ACQ_REL capability bit
+            case MEMORY_ORDER_ACQ_REL: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_ORDER_SEQ_CST: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            default: {
+                log_info("Invalid memory order\n"); // logged only, no skip
+                break;
+            }
+        }
+        return 0; // every required capability bit is present
+    }
+    virtual bool SVMDataBufferAllSVMConsistent() { return false; } // default
+    bool UseSVM() { return _useSVM; } // _useSVM is const
+    void StartValue(HostDataType startValue) { _startValue = startValue; }
+    HostDataType StartValue() { return _startValue; } // initial dest value
+    void LocalMemory(bool local) { _localMemory = local; }
+    bool LocalMemory() { return _localMemory; } // "__local " vs "__global "
+    void DeclaredInProgram(bool declaredInProgram)
+    { // when set (and not local), ProgramHeader() declares destMemory itself
+        _declaredInProgram = declaredInProgram;
+    }
+    bool DeclaredInProgram() { return _declaredInProgram; }
+    void UsedInFunction(bool local) { _usedInFunction = local; }
+    bool UsedInFunction() { return _usedInFunction; } // gates FunctionCode()
+    void GenericAddrSpace(bool genericAddrSpace)
+    { // when set, FunctionCode() omits the pointer's address-space qualifier
+        _genericAddrSpace = genericAddrSpace;
+    }
+    bool GenericAddrSpace() { return _genericAddrSpace; }
+    void OldValueCheck(bool check) { _oldValueCheck = check; }
+    bool OldValueCheck() { return _oldValueCheck; } // readers not shown here
+    void LocalRefValues(bool localRefValues)
+    { // when set, FunctionCode() adds a "__local ... *localValues" parameter
+        _localRefValues = localRefValues;
+    }
+    bool LocalRefValues() { return _localRefValues; }
+    void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; }
+    cl_uint MaxGroupSize() { return _maxGroupSize; } // 0 means "no cap"
+    void CurrentGroupSize(cl_uint currentGroupSize)
+    {
+        // A MaxGroupSize() of 0 means "no cap"; any other value is an upper
+        // bound that the requested size is clamped to.
+        const cl_uint cap = MaxGroupSize();
+        const bool exceedsCap = (cap != 0) && (cap < currentGroupSize);
+        _currentGroupSize = exceedsCap ? cap : currentGroupSize;
+    }
+    // Group size last stored by the setter above.
+    cl_uint CurrentGroupSize()
+    {
+        return _currentGroupSize;
+    }
+    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
+    {
+        // No threads -> no groups. Local-memory tests always use exactly one
+        // group; otherwise the thread count is divided by the group size.
+        cl_uint groupCount = 0;
+        if (threadCount != 0)
+        {
+            groupCount = LocalMemory() ? 1 : threadCount / CurrentGroupSize();
+        }
+        return groupCount;
+    }
+    // Iteration count held in the const member _iterations.
+    cl_int Iterations()
+    {
+        return _iterations;
+    }
+    // Same count rendered as text through a string stream.
+    std::string IterationsStr()
+    {
+        std::stringstream text;
+        text << _iterations;
+        const std::string rendered = text.str();
+        return rendered;
+    }
-  TExplicitMemoryOrderType _memoryOrder2;
+    const TExplicitAtomicType _dataType; // wrapped by DataType()
+    const bool _useSVM; // read through UseSVM()
+    HostDataType _startValue; // written into ATOMIC_VAR_INIT(...) initialisers
+    bool _localMemory; // "__local " (true) or "__global " (false) in headers
+    bool _declaredInProgram; // destMemory declared by ProgramHeader()
+    bool _usedInFunction; // FunctionCode() emits test_atomic_function
+    bool _genericAddrSpace; // omit address-space qualifier in FunctionCode()
+    bool _oldValueCheck; // see OldValueCheck(); readers are not shown here
+    bool _localRefValues; // add the __local localValues parameter
+    cl_uint _maxGroupSize; // cap applied by CurrentGroupSize(); 0 = no cap
+    cl_uint _currentGroupSize; // divisor used by CurrentGroupNum()
+    cl_uint _passCount; // not referenced in the code visible here
+    const cl_int _iterations; // read through Iterations()/IterationsStr()
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
-  std::string pragma;
-  if(gOldAPI)
-  {
-    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
-  }
-  // Create the pragma lines for this kernel
-  if(DataType().Size(deviceID) == 8)
-  {
-    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
-  }
-  if(_dataType == TYPE_ATOMIC_DOUBLE)
-    pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-  return pragma;
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
-  // Create the program header
-  std::string header;
-  std::string aTypeName = DataType().AtomicTypeName();
-  std::string cTypeName = DataType().RegularTypeName();
-  std::string argListForKernel;
-  std::string argListForFunction;
-  std::string argListNoTypes;
-  std::string functionPrototype;
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  if(gOldAPI)
-  {
-    header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
-      "#define atomic_store(x,y)                                (*(x) = y)\n"
-      "#define atomic_load(x)                                   (*(x))\n"
-      "#define ATOMIC_VAR_INIT(x)                               (x)\n"
-      "#define ATOMIC_FLAG_INIT                                 0\n"
-      "#define atomic_init(x,y)                                 atomic_store(x,y)\n";
-    if(aTypeName == "atomic_float")
-      header += "#define atomic_exchange(x,y)                             atomic_xchg(x,y)\n";
-    else if(aTypeName == "atomic_double")
-      header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
-        "{\n"
-        "  long tmp = *(long*)&y, res;\n"
-        "  volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n"
-        "  res = atom_xchg(tmpA,tmp);\n"
-        "  return *(double*)&res;\n"
-        "}\n";
-    else
-      header += "#define atomic_exchange(x,y)                             atom_xchg(x,y)\n";
-    if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
-      header +=
-      "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
-      "{\n"
-      "  "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
-      "  if(old == *expected)\n"
-      "    return true;\n"
-      "  *expected = old;\n"
-      "  return false;\n"
-      "}\n"
-      "#define atomic_compare_exchange_weak                     atomic_compare_exchange_strong\n";
-    header +=
-      "#define atomic_fetch_add(x,y)                            atom_add(x,y)\n"
-      "#define atomic_fetch_sub(x,y)                            atom_sub(x,y)\n"
-      "#define atomic_fetch_or(x,y)                             atom_or(x,y)\n"
-      "#define atomic_fetch_xor(x,y)                            atom_xor(x,y)\n"
-      "#define atomic_fetch_and(x,y)                            atom_and(x,y)\n"
-      "#define atomic_fetch_min(x,y)                            atom_min(x,y)\n"
-      "#define atomic_fetch_max(x,y)                            atom_max(x,y)\n"
-      "#define atomic_flag_test_and_set(x)                      atomic_exchange(x,1)\n"
-      "#define atomic_flag_clear(x)                             atomic_store(x,0)\n"
-      "\n";
-  }
-  if(!LocalMemory() && DeclaredInProgram())
-  {
-    // additional atomic variable for results copying (last thread will do this)
-    header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
-    // atomic variables declared in program scope - test data
-    std::stringstream ss;
-    ss << maxNumDestItems;
-    header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
-    ss.str("");
-    ss << _startValue;
-    for(cl_uint i = 0; i < maxNumDestItems; i++)
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestMemOrderScope // CBasicTest plus a memory order/scope sweep
+    : public CBasicTest<HostAtomicType, HostDataType> {
+    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory; // base-class
+    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize; // members
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false)
+        : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
+    {} // only forwards; _memoryOrder and _memoryScope are not set here
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
-      if(aTypeName == "atomic_flag")
-        header +=  "  ATOMIC_FLAG_INIT";
-      else
-        header +=  "  ATOMIC_VAR_INIT("+ss.str()+")";
-      if(i+1 < maxNumDestItems)
-        header += ",";
-      header += "\n";
+        std::string header; // text placed in front of the base-class header
+        if (gOldAPI) // old API: map each *_explicit call to its plain form
+        { // s adds a trailing scope parameter to the macros when a scope is set
+            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
+            header += "#define atomic_store_explicit(x,y,o" + s
+                + ")                     atomic_store(x,y)\n"
+                  "#define atomic_load_explicit(x,o"
+                + s
+                + ")                        atomic_load(x)\n"
+                  "#define atomic_exchange_explicit(x,y,o"
+                + s
+                + ")                  atomic_exchange(x,y)\n"
+                  "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"
+                + s
+                + ") atomic_compare_exchange_strong(x,y,z)\n"
+                  "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"
+                + s
+                + ")   atomic_compare_exchange_weak(x,y,z)\n"
+                  "#define atomic_fetch_add_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_add(x,y)\n"
+                  "#define atomic_fetch_sub_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_sub(x,y)\n"
+                  "#define atomic_fetch_or_explicit(x,y,o"
+                + s
+                + ")                  atomic_fetch_or(x,y)\n"
+                  "#define atomic_fetch_xor_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_xor(x,y)\n"
+                  "#define atomic_fetch_and_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_and(x,y)\n"
+                  "#define atomic_fetch_min_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_min(x,y)\n"
+                  "#define atomic_fetch_max_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_max(x,y)\n"
+                  "#define atomic_flag_test_and_set_explicit(x,o"
+                + s
+                + ")           atomic_flag_test_and_set(x)\n"
+                  "#define atomic_flag_clear_explicit(x,o"
+                + s + ")                  atomic_flag_clear(x)\n";
+        }
+        return header // the macros above come first, then the base header
+            + CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(
+                   maxNumDestItems);
-    header+=
-      "};\n"
-      "\n";
-  }
-  return header;
+    virtual std::string SingleTestName()
+    {
+        // sizeof("memory") counts the terminating NUL, so it equals the
+        // length of "memory" plus one more character; substr() drops that
+        // many leading characters from the order/scope names.
+        const std::string::size_type skip = sizeof("memory");
+        std::string name =
+            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+        {
+            const std::string orderSuffix =
+                std::string(get_memory_order_type_name(MemoryOrder()))
+                    .substr(skip);
+            name += ", " + orderSuffix;
+        }
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+        {
+            const std::string scopeSuffix =
+                std::string(get_memory_scope_type_name(MemoryScope()))
+                    .substr(skip);
+            name += ", " + scopeSuffix;
+        }
+        return name;
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        const TExplicitMemoryScopeType scope = MemoryScope();
+        // Memory scope should only be used for global memory: with local
+        // memory only the empty and work-group scopes are exercised, every
+        // other scope is reported as a pass without running.
+        const bool scopeAllowedForLocal =
+            scope == MEMORY_SCOPE_EMPTY || scope == MEMORY_SCOPE_WORK_GROUP;
+        if (LocalMemory() && !scopeAllowedForLocal) return 0;
+        // Device scope: increase the number of groups by forcing a smaller
+        // group size (16). Otherwise 0 leaves the group size limited only by
+        // device capabilities.
+        MaxGroupSize(scope == MEMORY_SCOPE_DEVICE ? 16 : 0);
+        // Skip test - not applicable when a capability is missing.
+        const int capability = CheckCapabilities(scope, MemoryOrder());
+        if (capability == TEST_SKIPPED_ITSELF) return 0;
+        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
+            deviceID, context, queue);
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    { // Runs the base-class sweep once per accepted (order, scope) pair.
+        // repeat test for each reasonable memory order/scope combination
+        std::vector<TExplicitMemoryOrderType> memoryOrder;
+        std::vector<TExplicitMemoryScopeType> memoryScope;
+        int error = 0;
+        // For OpenCL-3.0 and later some orderings and scopes are optional, so
+        // here we query for the supported ones.
+        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
+                                                         memoryScope),
+                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
+        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
+        {
+            for (unsigned si = 0; si < memoryScope.size(); si++)
+            {
+                if (memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                    && memoryScope[si] != MEMORY_SCOPE_EMPTY)
+                    continue; // a scope is never combined with an empty order
+                MemoryOrder(memoryOrder[oi]); // stored state is read by
+                MemoryScope(memoryScope[si]); // ExecuteSingleTest()
+                EXECUTE_TEST(
+                    error,
+                    (CBasicTest<HostAtomicType, HostDataType>::
+                         ExecuteForEachParameterSet(deviceID, context, queue)));
+            }
+        }
+        return error;
+    }
+    // Stores the memory order used by the following test runs.
+    void MemoryOrder(TExplicitMemoryOrderType memoryOrder)
+    {
+        _memoryOrder = memoryOrder;
+    }
+    // Memory order currently stored.
+    TExplicitMemoryOrderType MemoryOrder()
+    {
+        return _memoryOrder;
+    }
+    // ", <order name>" for a non-empty order, otherwise an empty string.
+    std::string MemoryOrderStr()
+    {
+        std::string text;
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+        {
+            text = ", ";
+            text += get_memory_order_type_name(MemoryOrder());
+        }
+        return text;
+    }
+    // Stores the memory scope used by the following test runs.
+    void MemoryScope(TExplicitMemoryScopeType memoryScope)
+    {
+        _memoryScope = memoryScope;
+    }
+    // Memory scope currently stored.
+    TExplicitMemoryScopeType MemoryScope()
+    {
+        return _memoryScope;
+    }
+    // ", <scope name>" for a non-empty scope, otherwise an empty string.
+    std::string MemoryScopeStr()
+    {
+        std::string text;
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+        {
+            text = ", ";
+            text += get_memory_scope_type_name(MemoryScope());
+        }
+        return text;
+    }
+    // Order text followed by scope text; each part may be empty.
+    std::string MemoryOrderScopeStr()
+    {
+        std::string combined = MemoryOrderStr();
+        combined += MemoryScopeStr();
+        return combined;
+    }
+    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
+    {
+        // Work-group scope always reports a single group; every other scope
+        // defers to the base-class computation.
+        const bool singleGroup = (MemoryScope() == MEMORY_SCOPE_WORK_GROUP);
+        return singleGroup
+            ? 1
+            : CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(
+                  threadCount);
+    }
+    virtual cl_uint MaxHostThreads()
+    {
+        // Host threads are blocked unless the scope is all_devices or
+        // all_svm_devices, or gHost is set; when allowed, the base class
+        // decides the actual count.
+        const TExplicitMemoryScopeType scope = MemoryScope();
+        const bool hostAllowed = scope == MEMORY_SCOPE_ALL_DEVICES
+            || scope == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost;
+        if (!hostAllowed) return 0;
+        return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
+    }
+    TExplicitMemoryOrderType _memoryOrder; // set via MemoryOrder(...)
+    TExplicitMemoryScopeType _memoryScope; // set via MemoryScope(...)
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestMemOrder2Scope // adds a second memory order (MemoryOrder2)
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {} // only forwards to the base; _memoryOrder2 is not set here
+    virtual std::string SingleTestName()
+    {
+        // sizeof("memory") counts the terminating NUL, so it equals the
+        // length of "memory" plus one more character; substr() drops that
+        // many leading characters from each order/scope name.
+        const std::string::size_type skip = sizeof("memory");
+        std::string name =
+            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+        {
+            const std::string firstOrder =
+                std::string(get_memory_order_type_name(MemoryOrder()))
+                    .substr(skip);
+            name += ", " + firstOrder;
+        }
+        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
+        {
+            const std::string secondOrder =
+                std::string(get_memory_order_type_name(MemoryOrder2()))
+                    .substr(skip);
+            name += ", " + secondOrder;
+        }
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+        {
+            const std::string scopeName =
+                std::string(get_memory_scope_type_name(MemoryScope()))
+                    .substr(skip);
+            name += ", " + scopeName;
+        }
+        return name;
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    { // Runs the CBasicTest sweep per accepted (order, order2, scope) triple.
+        // repeat test for each reasonable memory order/scope combination
+        std::vector<TExplicitMemoryOrderType> memoryOrder;
+        std::vector<TExplicitMemoryScopeType> memoryScope;
+        int error = 0;
+        // For OpenCL-3.0 and later some orderings and scopes are optional, so
+        // here we query for the supported ones.
+        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
+                                                         memoryScope),
+                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
+        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
+        {
+            for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
+            { // both order indices walk the same memoryOrder list
+                for (unsigned si = 0; si < memoryScope.size(); si++)
+                {
+                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
+                        && memoryOrder[oi] != memoryOrder[o2i])
+                        continue; // both memory order arguments must be set (or
+                                  // none)
+                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
+                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
+                        continue; // memory scope without memory order is not
+                                  // allowed
+                    MemoryOrder(memoryOrder[oi]);
+                    MemoryOrder2(memoryOrder[o2i]);
+                    MemoryScope(memoryScope[si]);
+                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
+                        == TEST_SKIPPED_ITSELF)
+                        continue; // skip test - not applicable
+                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
+                        == TEST_SKIPPED_ITSELF)
+                        continue; // skip test - not applicable
+                    EXECUTE_TEST(error,
+                                 (CBasicTest<HostAtomicType, HostDataType>::
+                                      ExecuteForEachParameterSet(
+                                          deviceID, context, queue)));
+                }
+            }
+        }
+        return error;
+    }
+    // Stores the second memory order (the parameter is named
+    // memoryOrderFail).
+    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail)
+    {
+        _memoryOrder2 = memoryOrderFail;
+    }
+    // Second memory order currently stored.
+    TExplicitMemoryOrderType MemoryOrder2()
+    {
+        return _memoryOrder2;
+    }
+    // ", <order name>" for a non-empty second order, otherwise "".
+    std::string MemoryOrderFailStr()
+    {
+        std::string text;
+        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
+        {
+            text = ", ";
+            text += get_memory_order_type_name(MemoryOrder2());
+        }
+        return text;
+    }
+    // First order, second order and scope texts joined in that sequence.
+    std::string MemoryOrderScope()
+    {
+        std::string combined = MemoryOrderStr();
+        combined += MemoryOrderFailStr();
+        combined += MemoryScopeStr();
+        return combined;
+    }
+    TExplicitMemoryOrderType _memoryOrder2; // set via MemoryOrder2(...)
+template <typename HostAtomicType, typename HostDataType>
+CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
+    std::string pragma; // accumulates "#pragma OPENCL EXTENSION" lines
+    if (gOldAPI) // old API: enable the 32-bit local/global atomics extensions
+    {
+        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : "
+                  "enable\n";
+        pragma += "#pragma OPENCL EXTENSION "
+                  "cl_khr_local_int32_extended_atomics : enable\n";
+        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : "
+                  "enable\n";
+        pragma += "#pragma OPENCL EXTENSION "
+                  "cl_khr_global_int32_extended_atomics : enable\n";
+    }
+    // Create the pragma lines for this kernel
+    if (DataType().Size(deviceID) == 8) // 8-byte types: int64 atomics
+    {
+        pragma +=
+            "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
+        pragma +=
+            "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
+    }
+    if (_dataType == TYPE_ATOMIC_DOUBLE) // double also needs cl_khr_fp64
+        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    return pragma;
-template<typename HostAtomicType, typename HostDataType>
+// Builds the text placed ahead of the generated kernel source.
+// With gOldAPI set it emits macros/functions that map the atomic_* names
+// onto the atom_* built-ins. For global memory with DeclaredInProgram() set
+// it also declares finishedThreads and a destMemory array of
+// maxNumDestItems elements initialised from _startValue.
+template <typename HostAtomicType, typename HostDataType>
+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
+    // Create the program header
+    std::string header;
+    std::string aTypeName = DataType().AtomicTypeName();
+    std::string cTypeName = DataType().RegularTypeName();
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+    if (gOldAPI)
+    {
+        header += std::string("#define ") + aTypeName + " " + cTypeName
+            + "\n"
+              "#define atomic_store(x,y)                                (*(x) "
+              "= y)\n"
+              "#define atomic_load(x)                                   "
+              "(*(x))\n"
+              "#define ATOMIC_VAR_INIT(x)                               (x)\n"
+              "#define ATOMIC_FLAG_INIT                                 0\n"
+              "#define atomic_init(x,y)                                 "
+              "atomic_store(x,y)\n";
+        if (aTypeName == "atomic_float")
+            header += "#define atomic_exchange(x,y)                            "
+                      " atomic_xchg(x,y)\n";
+        else if (aTypeName == "atomic_double")
+            // The double exchange goes through atom_xchg on a long pointer.
+            // The cast target must be a pointer type ("long *"); casting the
+            // pointer x to a plain "long" and assigning that to tmpA is not
+            // a valid pointer conversion.
+            header += "double atomic_exchange(volatile " + addressSpace
+                + "atomic_double *x, double y)\n"
+                  "{\n"
+                  "  long tmp = *(long*)&y, res;\n"
+                  "  volatile "
+                + addressSpace + "long *tmpA = (volatile " + addressSpace
+                + "long *)x;\n"
+                  "  res = atom_xchg(tmpA,tmp);\n"
+                  "  return *(double*)&res;\n"
+                  "}\n";
+        else
+            header += "#define atomic_exchange(x,y)                            "
+                      " atom_xchg(x,y)\n";
+        // Compare-exchange is emulated with atom_cmpxchg for every type
+        // except float and double.
+        if (aTypeName != "atomic_float" && aTypeName != "atomic_double")
+            header += "bool atomic_compare_exchange_strong(volatile "
+                + addressSpace + " " + aTypeName + " *a, " + cTypeName
+                + " *expected, " + cTypeName
+                + " desired)\n"
+                  "{\n"
+                  "  "
+                + cTypeName
+                + " old = atom_cmpxchg(a, *expected, desired);\n"
+                  "  if(old == *expected)\n"
+                  "    return true;\n"
+                  "  *expected = old;\n"
+                  "  return false;\n"
+                  "}\n"
+                  "#define atomic_compare_exchange_weak                     "
+                  "atomic_compare_exchange_strong\n";
+        header += "#define atomic_fetch_add(x,y)                            "
+                  "atom_add(x,y)\n"
+                  "#define atomic_fetch_sub(x,y)                            "
+                  "atom_sub(x,y)\n"
+                  "#define atomic_fetch_or(x,y)                             "
+                  "atom_or(x,y)\n"
+                  "#define atomic_fetch_xor(x,y)                            "
+                  "atom_xor(x,y)\n"
+                  "#define atomic_fetch_and(x,y)                            "
+                  "atom_and(x,y)\n"
+                  "#define atomic_fetch_min(x,y)                            "
+                  "atom_min(x,y)\n"
+                  "#define atomic_fetch_max(x,y)                            "
+                  "atom_max(x,y)\n"
+                  "#define atomic_flag_test_and_set(x)                      "
+                  "atomic_exchange(x,1)\n"
+                  "#define atomic_flag_clear(x)                             "
+                  "atomic_store(x,0)\n"
+                  "\n";
+    }
+    if (!LocalMemory() && DeclaredInProgram())
+    {
+        // additional atomic variable for results copying (last thread will do
+        // this)
+        header += "__global volatile atomic_uint finishedThreads = "
+                  "ATOMIC_VAR_INIT(0);\n";
+        // atomic variables declared in program scope - test data
+        std::stringstream ss;
+        ss << maxNumDestItems;
+        header += std::string("__global volatile ") + aTypeName + " destMemory["
+            + ss.str() + "] = {\n";
+        // Reuse the stream to render the start value once; the same text is
+        // repeated for every array element below.
+        ss.str("");
+        ss << _startValue;
+        for (cl_uint i = 0; i < maxNumDestItems; i++)
+        {
+            if (aTypeName == "atomic_flag")
+                header += "  ATOMIC_FLAG_INIT";
+            else
+                header += "  ATOMIC_VAR_INIT(" + ss.str() + ")";
+            if (i + 1 < maxNumDestItems) header += ",";
+            header += "\n";
+        }
+        header += "};\n"
+                  "\n";
+    }
+    return header;
+template <typename HostAtomicType, typename HostDataType>
 std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
-  if(!UsedInFunction())
-    return "";
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
-  if(!GenericAddrSpace())
-    code += addressSpace;
-  code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
-    " *oldValues";
-  if(LocalRefValues())
-    code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
-  code += ")\n"
-    "{\n";
-  code += ProgramCore();
-  code += "}\n"
-    "\n";
-  return code;
+    if (!UsedInFunction()) return ""; // no helper function requested
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+    std::string code = "void test_atomic_function(uint tid, uint threadCount, "
+                       "uint numDestItems, volatile ";
+    if (!GenericAddrSpace()) code += addressSpace; // else: no qualifier
+    code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global "
+        + DataType().RegularTypeName() + " *oldValues";
+    if (LocalRefValues()) // optional extra __local parameter
+        code += std::string(", __local ") + DataType().RegularTypeName()
+            + " *localValues";
+    code += ")\n"
+            "{\n";
+    code += ProgramCore(); // function body text comes from ProgramCore()
+    code += "}\n"
+            "\n";
+    return code;
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
+template <typename HostAtomicType, typename HostDataType>
+CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
-  std::string aTypeName = DataType().AtomicTypeName();
-  std::string cTypeName = DataType().RegularTypeName();
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";
+    std::string aTypeName = DataType().AtomicTypeName();
+    std::string cTypeName = DataType().RegularTypeName();
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+    std::string code = "__kernel void test_atomic_kernel(uint threadCount, "
+                       "uint numDestItems, ";
-  // prepare list of arguments for kernel
-  if(LocalMemory())
-  {
-    code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
-      " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
-  }
-  else
-  {
-    code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
-      ", __global "+cTypeName+" *oldValues";
-  }
-  if(LocalRefValues())
-    code += std::string(", __local ")+cTypeName+" *localValues";
-  code += ")\n"
-    "{\n";
-  if(LocalMemory() && DeclaredInProgram())
-  {
-    // local atomics declared in kernel scope
-    std::stringstream ss;
-    ss << maxNumDestItems;
-    code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
-  }
-  code += "  uint  tid = get_global_id(0);\n"
-    "\n";
-  if(LocalMemory())
-  {
-      // memory_order_relaxed is sufficient for these initialization operations
-      // as the barrier below will act as a fence, providing an order to the
-      // operations. memory_scope_work_group is sufficient as local memory is
-      // only visible within the work-group.
-      code += R"(
+    // prepare list of arguments for kernel
+    if (LocalMemory())
+    {
+        code += std::string("__global ") + cTypeName + " *finalDest, __global "
+            + cTypeName
+            + " *oldValues,"
+              " volatile "
+            + addressSpace + aTypeName + " *"
+            + (DeclaredInProgram() ? "notUsed" : "") + "destMemory";
+    }
+    else
+    {
+        code += "volatile " + addressSpace
+            + (DeclaredInProgram() ? (cTypeName + " *finalDest")
+                                   : (aTypeName + " *destMemory"))
+            + ", __global " + cTypeName + " *oldValues";
+    }
+    if (LocalRefValues())
+        code += std::string(", __local ") + cTypeName + " *localValues";
+    code += ")\n"
+            "{\n";
+    if (LocalMemory() && DeclaredInProgram())
+    {
+        // local atomics declared in kernel scope
+        std::stringstream ss;
+        ss << maxNumDestItems;
+        code += std::string("  __local volatile ") + aTypeName + " destMemory["
+            + ss.str() + "];\n";
+    }
+    code += "  uint  tid = get_global_id(0);\n"
+            "\n";
+    if (LocalMemory())
+    {
+        // memory_order_relaxed is sufficient for these initialization
+        // operations as the barrier below will act as a fence, providing an
+        // order to the operations. memory_scope_work_group is sufficient as
+        // local memory is only visible within the work-group.
+        code += R"(
               // initialize atomics not reachable from host (first thread
               // is doing this, other threads are waiting on barrier)
               if(get_local_id(0) == 0)
                 for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
-      if (aTypeName == "atomic_flag")
-      {
-          code += R"(
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
@@ -823,512 +976,595 @@
-      }
-    else
-    {
-        code += R"(
+        }
+        else
+        {
+            code += R"(
+        }
+        code += "    }\n"
+                "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "\n";
-    code +=
-      "    }\n"
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-      "\n";
-  }
-  if (LocalRefValues())
-  {
-    code +=
-      "  // Copy input reference values into local memory\n";
-    if (NumNonAtomicVariablesPerThread() == 1)
-      code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
-    else
+    if (LocalRefValues())
-      std::stringstream ss;
-      ss << NumNonAtomicVariablesPerThread();
-      code +=
-        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
-        "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
+        code += "  // Copy input reference values into local memory\n";
+        if (NumNonAtomicVariablesPerThread() == 1)
+            code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
+        else
+        {
+            std::stringstream ss;
+            ss << NumNonAtomicVariablesPerThread();
+            code += "  for(uint rfId = 0; rfId < " + ss.str()
+                + "; rfId++)\n"
+                  "    localValues[get_local_id(0)*"
+                + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
+        }
+        code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "\n";
-    code +=
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-      "\n";
-  }
-  if (UsedInFunction())
-    code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
-    (LocalRefValues() ? ", localValues" : "")+");\n";
-  else
-    code += ProgramCore();
-  code += "\n";
-  if (LocalRefValues())
-  {
-    code +=
-      "  // Copy local reference values into output array\n"
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n";
-    if (NumNonAtomicVariablesPerThread() == 1)
-      code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
+    if (UsedInFunction())
+        code += std::string("  test_atomic_function(tid, threadCount, "
+                            "numDestItems, destMemory, oldValues")
+            + (LocalRefValues() ? ", localValues" : "") + ");\n";
-    {
-      std::stringstream ss;
-      ss << NumNonAtomicVariablesPerThread();
-      code +=
-        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
-        "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
-    }
+        code += ProgramCore();
     code += "\n";
-  }
-  if(LocalMemory() || DeclaredInProgram())
-  {
-    code += "  // Copy final values to host reachable buffer\n";
-    if(LocalMemory())
-      code +=
-        "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-        "  if(get_local_id(0) == 0) // first thread in workgroup\n";
-    else
-      // global atomics declared in program scope
-      code += R"(
-                if(atomic_fetch_add_explicit(&finishedThreads, 1u,
-                                           memory_order_relaxed,
-                                           memory_scope_work_group)
-                   == get_global_size(0)-1) // last finished thread
-                   )";
-    code +=
-        "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
-    if(aTypeName == "atomic_flag")
+    if (LocalRefValues())
-        code += R"(
+        code += "  // Copy local reference values into output array\n"
+                "  barrier(CLK_LOCAL_MEM_FENCE);\n";
+        if (NumNonAtomicVariablesPerThread() == 1)
+            code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
+        else
+        {
+            std::stringstream ss;
+            ss << NumNonAtomicVariablesPerThread();
+            code += "  for(uint rfId = 0; rfId < " + ss.str()
+                + "; rfId++)\n"
+                  "    oldValues[tid*"
+                + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str()
+                + "+rfId];\n";
+        }
+        code += "\n";
+    }
+    if (LocalMemory())
+    {
+        code += "  // Copy final values to host reachable buffer\n";
+        code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "  if(get_local_id(0) == 0) // first thread in workgroup\n";
+        code += "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
+                "dstItemIdx++)\n";
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
                 finalDest[dstItemIdx] =
-    }
-    else
-    {
-        code += R"(
+        }
+        else
+        {
+            code += R"(
                 finalDest[dstItemIdx] =
+        }
-  }
-  code += "}\n"
-    "\n";
-  return code;
+    else if (DeclaredInProgram())
+    {
+        // global atomics declared in program scope
+        code += "  // Copy final values to host reachable buffer\n";
+        code += R"(
+            if(atomic_fetch_add_explicit(&finishedThreads, 1u,
+                                         memory_order_acq_rel,
+                                         memory_scope_device)
+                   == get_global_size(0)-1) // last finished thread
+                )";
+        code += "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
+                "dstItemIdx++)\n";
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
+                finalDest[dstItemIdx] =
+                    atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
+                                                      memory_order_relaxed,
+                                                      memory_scope_device);)";
+        }
+        else
+        {
+            code += R"(
+                finalDest[dstItemIdx] =
+                    atomic_load_explicit(destMemory+dstItemIdx,
+                                         memory_order_relaxed,
+                                         memory_scope_device);)";
+        }
+    }
+    code += "}\n"
+            "\n";
+    return code;
 template <typename HostAtomicType, typename HostDataType>
-int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
+int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  int error;
-  clProgramWrapper program;
-  clKernelWrapper kernel;
-  size_t threadNum[1];
-  clMemWrapper streams[2];
-  std::vector<HostAtomicType> destItems;
-  HostAtomicType *svmAtomicBuffer = 0;
-  std::vector<HostDataType> refValues, startRefValues;
-  HostDataType *svmDataBuffer = 0;
-  cl_uint deviceThreadCount, hostThreadCount, threadCount;
-  size_t groupSize = 0;
-  std::string programSource;
-  const char *programLine;
-  MTdata d;
-  size_t typeSize = DataType().Size(deviceID);
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    size_t threadNum[1];
+    clMemWrapper streams[2];
+    std::vector<HostAtomicType> destItems;
+    HostAtomicType *svmAtomicBuffer = 0;
+    std::vector<HostDataType> refValues, startRefValues;
+    HostDataType *svmDataBuffer = 0;
+    cl_uint deviceThreadCount, hostThreadCount, threadCount;
+    size_t groupSize = 0;
+    std::string programSource;
+    const char *programLine;
+    MTdata d;
+    size_t typeSize = DataType().Size(deviceID);
-  deviceThreadCount = _maxDeviceThreads;
-  hostThreadCount = MaxHostThreads();
-  threadCount = deviceThreadCount+hostThreadCount;
+    deviceThreadCount = _maxDeviceThreads;
+    hostThreadCount = MaxHostThreads();
+    threadCount = deviceThreadCount + hostThreadCount;
-  //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
-  log_info("\t%s...\n", SingleTestName().c_str());
+    // log_info("\t%s %s%s...\n", local ? "local" : "global",
+    // DataType().AtomicTypeName(), memoryOrderScope.c_str());
+    log_info("\t%s...\n", SingleTestName().c_str());
-  if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
-  {
-    log_info("\t\tTest disabled\n");
-    return 0;
-  }
-  if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
-  {
-    log_info("\t\tTest disabled\n");
-    return 0;
-  }
+    if (!LocalMemory() && DeclaredInProgram()
+        && gNoGlobalVariables) // no support for program scope global variables
+    {
+        log_info("\t\tTest disabled\n");
+        return 0;
+    }
+    if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
+    {
+        log_info("\t\tTest disabled\n");
+        return 0;
+    }
+    if (!LocalMemory() && DeclaredInProgram())
+    {
+        if (((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
+            || ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0))
+        {
+            log_info("\t\tTest disabled\n");
+            return 0;
+        }
+    }
-  // set up work sizes based on device capabilities and test configuration
-  error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
-  test_error(error, "Unable to obtain max work group size for device");
-  CurrentGroupSize((cl_uint)groupSize);
-  if(CurrentGroupSize() > deviceThreadCount)
-    CurrentGroupSize(deviceThreadCount);
-  if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
-    deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
-  threadCount = deviceThreadCount+hostThreadCount;
+    // set up work sizes based on device capabilities and test configuration
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(groupSize), &groupSize, NULL);
+    test_error(error, "Unable to obtain max work group size for device");
+    CurrentGroupSize((cl_uint)groupSize);
+    if (CurrentGroupSize() > deviceThreadCount)
+        CurrentGroupSize(deviceThreadCount);
+    if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
+        deviceThreadCount =
+            CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
+    threadCount = deviceThreadCount + hostThreadCount;
-  // If we're given a num_results function, we need to determine how many result objects we need.
-  // This is the first assessment for current maximum number of threads (exact thread count is not known here)
-  // - needed for program source code generation (arrays of atomics declared in program)
-  cl_uint numDestItems = NumResults(threadCount, deviceID);
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. This is the first assessment for current maximum
+    // number of threads (exact thread count is not known here)
+    // - needed for program source code generation (arrays of atomics declared
+    // in program)
+    cl_uint numDestItems = NumResults(threadCount, deviceID);
-  if(deviceThreadCount > 0)
-  {
-      // This loop iteratively reduces the workgroup size by 2 and then
-      // re-generates the kernel with the reduced
-      // workgroup size until we find a size which is admissible for the kernel
-      // being run or reduce the wg size
-      // to the trivial case of 1 (which was separately verified to be accurate
-      // for the kernel being run)
+    if (deviceThreadCount > 0)
+    {
+        // This loop iteratively reduces the workgroup size by 2 and then
+        // re-generates the kernel with the reduced
+        // workgroup size until we find a size which is admissible for the
+        // kernel being run or reduce the wg size to the trivial case of 1
+        // (which was separately verified to be accurate for the kernel being
+        // run)
-      while ((CurrentGroupSize() > 1))
-      {
-          // Re-generate the kernel code with the current group size
-          if (kernel) clReleaseKernel(kernel);
-          if (program) clReleaseProgram(program);
-          programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
-              + FunctionCode() + KernelCode(numDestItems);
-          programLine = programSource.c_str();
-          if (create_single_kernel_helper_with_build_options(
-                  context, &program, &kernel, 1, &programLine,
-                  "test_atomic_kernel", gOldAPI ? "" : nullptr))
-          {
-              return -1;
-          }
-          // Get work group size for the new kernel
-          error = clGetKernelWorkGroupInfo(kernel, deviceID,
-                                           CL_KERNEL_WORK_GROUP_SIZE,
-                                           sizeof(groupSize), &groupSize, NULL);
-          test_error(error,
-                     "Unable to obtain max work group size for device and "
-                     "kernel combo");
+        while ((CurrentGroupSize() > 1))
+        {
+            // Re-generate the kernel code with the current group size
+            if (kernel) clReleaseKernel(kernel);
+            if (program) clReleaseProgram(program);
+            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
+                + FunctionCode() + KernelCode(numDestItems);
+            programLine = programSource.c_str();
+            if (create_single_kernel_helper_with_build_options(
+                    context, &program, &kernel, 1, &programLine,
+                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
+            {
+                return -1;
+            }
+            // Get work group size for the new kernel
+            error = clGetKernelWorkGroupInfo(
+                kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize),
+                &groupSize, NULL);
+            test_error(error,
+                       "Unable to obtain max work group size for device and "
+                       "kernel combo");
-          if (LocalMemory())
-          {
-              cl_ulong usedLocalMemory;
-              cl_ulong totalLocalMemory;
-              cl_uint maxWorkGroupSize;
+            if (LocalMemory())
+            {
+                cl_ulong usedLocalMemory;
+                cl_ulong totalLocalMemory;
+                cl_uint maxWorkGroupSize;
-              error = clGetKernelWorkGroupInfo(
-                  kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
-                  sizeof(usedLocalMemory), &usedLocalMemory, NULL);
-              test_error(error, "clGetKernelWorkGroupInfo failed");
+                error = clGetKernelWorkGroupInfo(
+                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
+                test_error(error, "clGetKernelWorkGroupInfo failed");
-              error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
-                                      sizeof(totalLocalMemory),
-                                      &totalLocalMemory, NULL);
-              test_error(error, "clGetDeviceInfo failed");
+                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                                        sizeof(totalLocalMemory),
+                                        &totalLocalMemory, NULL);
+                test_error(error, "clGetDeviceInfo failed");
-              // We know that each work-group is going to use typeSize *
-              // deviceThreadCount bytes of local memory
-              // so pick the maximum value for deviceThreadCount that uses all
-              // the local memory.
-              maxWorkGroupSize =
-                  ((totalLocalMemory - usedLocalMemory) / typeSize);
+                // We know that each work-group is going to use typeSize *
+                // deviceThreadCount bytes of local memory
+                // so pick the maximum value for deviceThreadCount that uses all
+                // the local memory.
+                maxWorkGroupSize =
+                    ((totalLocalMemory - usedLocalMemory) / typeSize);
-              if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
-          }
-          if (CurrentGroupSize() <= groupSize)
-              break;
-          else
-              CurrentGroupSize(CurrentGroupSize() / 2);
-      }
-    if(CurrentGroupSize() > deviceThreadCount)
-      CurrentGroupSize(deviceThreadCount);
-    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
-      deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
-    threadCount = deviceThreadCount+hostThreadCount;
-  }
-  if (gDebug)
-  {
-      log_info("Program source:\n");
-      log_info("%s\n", programLine);
-  }
-  if(deviceThreadCount > 0)
-    log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
-  if(hostThreadCount > 0)
-    log_info("\t\t(host threads %u)\n", hostThreadCount);
+                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
+            }
+            if (CurrentGroupSize() <= groupSize)
+                break;
+            else
+                CurrentGroupSize(CurrentGroupSize() / 2);
+        }
+        if (CurrentGroupSize() > deviceThreadCount)
+            CurrentGroupSize(deviceThreadCount);
+        if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
+            deviceThreadCount =
+                CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
+        threadCount = deviceThreadCount + hostThreadCount;
+    }
+    if (gDebug)
+    {
+        log_info("Program source:\n");
+        log_info("%s\n", programLine);
+    }
+    if (deviceThreadCount > 0)
+        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount,
+                 CurrentGroupSize());
+    if (hostThreadCount > 0)
+        log_info("\t\t(host threads %u)\n", hostThreadCount);
-  refValues.resize(threadCount*NumNonAtomicVariablesPerThread());
+    refValues.resize(threadCount * NumNonAtomicVariablesPerThread());
-  // Generate ref data if we have a ref generator provided
-  d = init_genrand(gRandomSeed);
-  startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
-  if(GenerateRefs(threadCount, &startRefValues[0], d))
-  {
-    //copy ref values for host threads
-    memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
-  }
-  else
-  {
-    startRefValues.resize(0);
-  }
-  free_mtdata(d);
-  d = NULL;
-  // If we're given a num_results function, we need to determine how many result objects we need. If
-  // we don't have it, we assume it's just 1
-  // This is final value (exact thread count is known in this place)
-  numDestItems = NumResults(threadCount, deviceID);
-  destItems.resize(numDestItems);
-  for(cl_uint i = 0; i < numDestItems; i++)
-    destItems[i] = _startValue;
-  // Create main buffer with atomic variables (array size dependent on particular test)
-  if(UseSVM())
-  {
-    if(gUseHostPtr)
-      svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
+    // Generate ref data if we have a ref generator provided
+    d = init_genrand(gRandomSeed);
+    startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread());
+    if (GenerateRefs(threadCount, &startRefValues[0], d))
+    {
+        // copy ref values for host threads
+        memcpy(&refValues[0], &startRefValues[0],
+               sizeof(HostDataType) * threadCount
+                   * NumNonAtomicVariablesPerThread());
+    }
-      svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
-    if(!svmAtomicBuffer)
-      log_error("ERROR: clSVMAlloc failed!\n");
-      return -1;
+        startRefValues.resize(0);
-    memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
-    streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
-                                typeSize * numDestItems, svmAtomicBuffer, NULL);
-  }
-  else
-  {
-      streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                                  typeSize * numDestItems, &destItems[0], NULL);
-  }
-  if (!streams[0])
-  {
-    log_error("ERROR: Creating output array failed!\n");
-    return -1;
-  }
-  // Create buffer for per-thread input/output data
-  if(UseSVM())
-  {
-    if(gUseHostPtr)
-      svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
-    else
-      svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
-    if(!svmDataBuffer)
-    {
-      log_error("ERROR: clSVMAlloc failed!\n");
-      return -1;
-    }
-    if(startRefValues.size())
-      memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
-    streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
-                                typeSize * threadCount
-                                    * NumNonAtomicVariablesPerThread(),
-                                svmDataBuffer, NULL);
-  }
-  else
-  {
-      streams[1] = clCreateBuffer(
-          context,
-          ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
-          typeSize * threadCount * NumNonAtomicVariablesPerThread(),
-          startRefValues.size() ? &startRefValues[0] : 0, NULL);
-  }
-  if (!streams[1])
-  {
-    log_error("ERROR: Creating reference array failed!\n");
-    return -1;
-  }
-  if(deviceThreadCount > 0)
-  {
-    cl_uint argInd = 0;
-    /* Set the arguments */
-    error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
-    test_error(error, "Unable to set kernel argument");
-    error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
-    test_error(error, "Unable to set indexed kernel argument");
-    error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
-    test_error(error, "Unable to set indexed kernel arguments");
-    error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
-    test_error(error, "Unable to set indexed kernel arguments");
-    if(LocalMemory())
-    {
-      error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
-      test_error(error, "Unable to set indexed local kernel argument");
-    }
-    if(LocalRefValues())
-    {
-      error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
-      test_error(error, "Unable to set indexed kernel argument");
-    }
-  }
-  /* Configure host threads */
-  std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
-  for(unsigned int t = 0; t < hostThreadCount; t++)
-  {
-    hostThreadContexts[t].test = this;
-    hostThreadContexts[t].tid = deviceThreadCount+t;
-    hostThreadContexts[t].threadCount = threadCount;
-    hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
-    hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
-  }
+    free_mtdata(d);
+    d = NULL;
-  if(deviceThreadCount > 0)
-  {
-    /* Run the kernel */
-    threadNum[0] = deviceThreadCount;
-    groupSize = CurrentGroupSize();
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
-    test_error(error, "Unable to execute test kernel");
-    /* start device threads */
-    error = clFlush(queue);
-    test_error(error, "clFlush failed");
-  }
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. If we don't have it, we assume it's just 1 This
+    // is final value (exact thread count is known in this place)
+    numDestItems = NumResults(threadCount, deviceID);
-  /* Start host threads and wait for finish */
-  if(hostThreadCount > 0)
-    ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);
+    destItems.resize(numDestItems);
+    for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue;
-  if(UseSVM())
-  {
-    error = clFinish(queue);
-    test_error(error, "clFinish failed");
-    memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
-    memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
-  }
-  else
-  {
-    if(deviceThreadCount > 0)
+    // Create main buffer with atomic variables (array size dependent on
+    // particular test)
+    if (UseSVM())
-      error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
-      test_error(error, "Unable to read result value!");
-      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
-      test_error(error, "Unable to read reference values!");
-    }
-  }
-  bool dataVerified = false;
-  // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
-  // have one, it's because we're comparing ref values only
-  for(cl_uint i = 0; i < numDestItems; i++)
-  {
-    HostDataType expected;
-    if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
-      break; // no expected value function provided
-    if(expected != destItems[i])
-    {
-      std::stringstream logLine;
-      logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
-      log_error("%s", logLine.str().c_str());
-      for(i = 0; i < threadCount; i++)
-      {
-        logLine.str("");
-        logLine << " --- " << i << " - ";
-        if(startRefValues.size())
-          logLine << startRefValues[i] << " -> " << refValues[i];
+        if (gUseHostPtr)
+            svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems);
-          logLine << refValues[i];
-        logLine << " --- ";
-        if(i < numDestItems)
-          logLine << destItems[i];
-        logLine << "\n";
-        log_info("%s", logLine.str().c_str());
-      }
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-    dataVerified = true;
-  }
-  bool dataCorrect = false;
-  /* Use the verify function (if provided) to also check the results */
-  if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
-  {
-    if(!dataCorrect)
-    {
-      log_error("ERROR: Reference values did not validate!\n");
-      std::stringstream logLine;
-      for(cl_uint i = 0; i < threadCount; i++)
-      for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
-      {
-        logLine.str("");
-        logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
-        if(j == 0 && i < numDestItems)
-          logLine << destItems[i];
-        logLine << "\n";
-        log_info("%s", logLine.str().c_str());
-      }
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-  }
-  else if(!dataVerified)
-  {
-    log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
-    return -1;
-  }
-  if(OldValueCheck() &&
-    !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics
-                                             // 'old' value has been overwritten by previous clEnqueueNDRangeKernel
-  {
-    /* Re-write the starting value */
-    for(size_t i = 0; i < numDestItems; i++)
-      destItems[i] = _startValue;
-    refValues[0] = 0;
-    if(deviceThreadCount > 0)
-    {
-      error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
-      test_error(error, "Unable to write starting values!");
-      /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
-      threadNum[0] = 1;
-      error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
-      test_error(error, "Unable to execute test kernel");
-      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
-      test_error(error, "Unable to read reference values!");
+            svmAtomicBuffer = (HostAtomicType *)clSVMAlloc(
+                context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+                typeSize * numDestItems, 0);
+        if (!svmAtomicBuffer)
+        {
+            log_error("ERROR: clSVMAlloc failed!\n");
+            return -1;
+        }
+        memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                           typeSize * numDestItems, svmAtomicBuffer, NULL);
-      /* Start host thread */
-      HostFunction(0, 1, &destItems[0], &refValues[0]);
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           typeSize * numDestItems, &destItems[0], NULL);
+    }
+    if (!streams[0])
+    {
+        log_error("ERROR: Creating output array failed!\n");
+        return -1;
+    }
+    // Create buffer for per-thread input/output data
+    if (UseSVM())
+    {
+        if (gUseHostPtr)
+            svmDataBuffer = (HostDataType *)malloc(
+                typeSize * threadCount * NumNonAtomicVariablesPerThread());
+        else
+            svmDataBuffer = (HostDataType *)clSVMAlloc(
+                context,
+                    | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS
+                                                       : 0),
+                typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0);
+        if (!svmDataBuffer)
+        {
+            log_error("ERROR: clSVMAlloc failed!\n");
+            return -1;
+        }
+        if (startRefValues.size())
+            memcpy(svmDataBuffer, &startRefValues[0],
+                   typeSize * threadCount * NumNonAtomicVariablesPerThread());
+        streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                    typeSize * threadCount
+                                        * NumNonAtomicVariablesPerThread(),
+                                    svmDataBuffer, NULL);
+    }
+    else
+    {
+        streams[1] = clCreateBuffer(
+            context,
+            ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR
+                                    : CL_MEM_READ_WRITE)),
+            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
+            startRefValues.size() ? &startRefValues[0] : 0, NULL);
+    }
+    if (!streams[1])
+    {
+        log_error("ERROR: Creating reference array failed!\n");
+        return -1;
+    }
+    if (deviceThreadCount > 0)
+    {
+        cl_uint argInd = 0;
+        /* Set the arguments */
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
+        test_error(error, "Unable to set kernel argument");
+        error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems),
+                               &numDestItems);
+        test_error(error, "Unable to set indexed kernel argument");
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        if (LocalMemory())
+        {
+            error =
+                clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
+            test_error(error, "Unable to set indexed local kernel argument");
+        }
+        if (LocalRefValues())
+        {
+            error =
+                clSetKernelArg(kernel, argInd++,
+                               LocalRefValues() ? typeSize
+                                       * (CurrentGroupSize()
+                                          * NumNonAtomicVariablesPerThread())
+                                                : 1,
+                               NULL);
+            test_error(error, "Unable to set indexed kernel argument");
+        }
+    }
+    /* Configure host threads */
+    std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
+    for (unsigned int t = 0; t < hostThreadCount; t++)
+    {
+        hostThreadContexts[t].test = this;
+        hostThreadContexts[t].tid = deviceThreadCount + t;
+        hostThreadContexts[t].threadCount = threadCount;
+        hostThreadContexts[t].destMemory =
+            UseSVM() ? svmAtomicBuffer : &destItems[0];
+        hostThreadContexts[t].oldValues =
+            UseSVM() ? svmDataBuffer : &refValues[0];
-    if(refValues[0] != _startValue)//destItems[0])
+    if (deviceThreadCount > 0)
-      std::stringstream logLine;
-      logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
-        " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
-      log_error("%s", logLine.str().c_str());
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
+        /* Run the kernel */
+        threadNum[0] = deviceThreadCount;
+        groupSize = CurrentGroupSize();
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
+                                       &groupSize, 0, NULL, NULL);
+        test_error(error, "Unable to execute test kernel");
+        /* start device threads */
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
-  }
-  if(UseSVM())
-  {
-    // the buffer object must first be released before the SVM buffer is freed
-    error = clReleaseMemObject(streams[0]);
-    streams[0] = 0;
-    test_error(error, "clReleaseMemObject failed");
-    if(gUseHostPtr)
-      free(svmAtomicBuffer);
+    /* Start host threads and wait for finish */
+    if (hostThreadCount > 0)
+        ThreadPool_Do(HostThreadFunction, hostThreadCount,
+                      &hostThreadContexts[0]);
+    if (UseSVM())
+    {
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+        memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems);
+        memcpy(&refValues[0], svmDataBuffer,
+               typeSize * threadCount * NumNonAtomicVariablesPerThread());
+    }
-      clSVMFree(context, svmAtomicBuffer);
-    error = clReleaseMemObject(streams[1]);
-    streams[1] = 0;
-    test_error(error, "clReleaseMemObject failed");
-    if(gUseHostPtr)
-      free(svmDataBuffer);
-    else
-      clSVMFree(context, svmDataBuffer);
-  }
-  _passCount++;
-  return 0;
+    {
+        if (deviceThreadCount > 0)
+        {
+            error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                                        typeSize * numDestItems, &destItems[0],
+                                        0, NULL, NULL);
+            test_error(error, "Unable to read result value!");
+            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                        typeSize * deviceThreadCount
+                                            * NumNonAtomicVariablesPerThread(),
+                                        &refValues[0], 0, NULL, NULL);
+            test_error(error, "Unable to read reference values!");
+        }
+    }
+    bool dataVerified = false;
+    // If we have an expectedFn, then we need to generate a final value to
+    // compare against. If we don't have one, it's because we're comparing ref
+    // values only
+    for (cl_uint i = 0; i < numDestItems; i++)
+    {
+        HostDataType expected;
+        if (!ExpectedValue(expected, threadCount,
+                           startRefValues.size() ? &startRefValues[0] : 0, i))
+            break; // no expected value function provided
+        if (expected != destItems[i])
+        {
+            std::stringstream logLine;
+            logLine << "ERROR: Result " << i
+                    << " from kernel does not validate! (should be " << expected
+                    << ", was " << destItems[i] << ")\n";
+            log_error("%s", logLine.str().c_str());
+            for (i = 0; i < threadCount; i++)
+            {
+                logLine.str("");
+                logLine << " --- " << i << " - ";
+                if (startRefValues.size())
+                    logLine << startRefValues[i] << " -> " << refValues[i];
+                else
+                    logLine << refValues[i];
+                logLine << " --- ";
+                if (i < numDestItems) logLine << destItems[i];
+                logLine << "\n";
+                log_info("%s", logLine.str().c_str());
+            }
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+        dataVerified = true;
+    }
+    bool dataCorrect = false;
+    /* Use the verify function (if provided) to also check the results */
+    if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
+    {
+        if (!dataCorrect)
+        {
+            log_error("ERROR: Reference values did not validate!\n");
+            std::stringstream logLine;
+            for (cl_uint i = 0; i < threadCount; i++)
+                for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
+                {
+                    logLine.str("");
+                    logLine
+                        << " --- " << i << " - "
+                        << refValues[i * NumNonAtomicVariablesPerThread() + j]
+                        << " --- ";
+                    if (j == 0 && i < numDestItems) logLine << destItems[i];
+                    logLine << "\n";
+                    log_info("%s", logLine.str().c_str());
+                }
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+    }
+    else if (!dataVerified)
+    {
+        log_error("ERROR: Test doesn't check total or refs; no values are "
+                  "verified!\n");
+        return -1;
+    }
+    if (OldValueCheck()
+        && !(DeclaredInProgram()
+             && !LocalMemory())) // don't test for programs scope global atomics
+                                 // 'old' value has been overwritten by previous
+                                 // clEnqueueNDRangeKernel
+    {
+        /* Re-write the starting value */
+        for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue;
+        refValues[0] = 0;
+        if (deviceThreadCount > 0)
+        {
+            error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                                         typeSize * numDestItems, &destItems[0],
+                                         0, NULL, NULL);
+            test_error(error, "Unable to write starting values!");
+            /* Run the kernel once for a single thread, so we can verify that
+             * the returned value is the original one */
+            threadNum[0] = 1;
+            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
+                                           threadNum, 0, NULL, NULL);
+            test_error(error, "Unable to execute test kernel");
+            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize,
+                                        &refValues[0], 0, NULL, NULL);
+            test_error(error, "Unable to read reference values!");
+        }
+        else
+        {
+            /* Start host thread */
+            HostFunction(0, 1, &destItems[0], &refValues[0]);
+        }
+        if (refValues[0] != _startValue) // destItems[0])
+        {
+            std::stringstream logLine;
+            logLine << "ERROR: atomic function operated correctly but did NOT "
+                       "return correct 'old' value "
+                       " (should have been "
+                    << destItems[0] << ", returned " << refValues[0] << ")!\n";
+            log_error("%s", logLine.str().c_str());
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+    }
+    if (UseSVM())
+    {
+        // the buffer object must first be released before the SVM buffer is
+        // freed. The Wrapper Class method reset() will do that
+        streams[0].reset();
+        if (gUseHostPtr)
+            free(svmAtomicBuffer);
+        else
+            clSVMFree(context, svmAtomicBuffer);
+        streams[1].reset();
+        if (gUseHostPtr)
+            free(svmDataBuffer);
+        else
+            clSVMFree(context, svmDataBuffer);
+    }
+    _passCount++;
+    return 0;
 #endif //_COMMON_H_
diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp
index c3a190b..09c14ed 100644
--- a/test_conformance/c11_atomics/test_atomics.cpp
+++ b/test_conformance/c11_atomics/test_atomics.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -23,2188 +23,3209 @@
 #include <sstream>
 #include <vector>
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestStore : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestStore
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestStore(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    OldValueCheck(false);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return threadCount;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(MemoryOrder() == MEMORY_ORDER_ACQUIRE ||
-      MemoryOrder() == MEMORY_ORDER_ACQ_REL)
-      return 0; //skip test - not applicable
-    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-    return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      "  atomic_store"+postfix+"(&destMemory[tid], tid"+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    host_atomic_store(&destMemory[tid], (HostDataType)tid, MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = (HostDataType)whichDestValue;
-    return true;
-  }
-int test_atomic_store_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestStore<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestStore<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestStore<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestStore<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestStore<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestStore<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestStore<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestStore<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestStore<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_store(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_store_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_store(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_store_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestInit : public CBasicTest<HostAtomicType, HostDataType>
-  using CBasicTest<HostAtomicType, HostDataType>::OldValueCheck;
-  CBasicTestInit(TExplicitAtomicType dataType, bool useSVM) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    OldValueCheck(false);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return threadCount;
-  }
-  virtual std::string ProgramCore()
-  {
-    return
-      "  atomic_init(&destMemory[tid], tid);\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    host_atomic_init(&destMemory[tid], (HostDataType)tid);
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = (HostDataType)whichDestValue;
-    return true;
-  }
-int test_atomic_init_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestInit<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestInit<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestInit<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestInit<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_init(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_init_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_init(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_init_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestLoad : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestLoad(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    OldValueCheck(false);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return threadCount;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(MemoryOrder() == MEMORY_ORDER_RELEASE ||
-      MemoryOrder() == MEMORY_ORDER_ACQ_REL)
-      return 0; //skip test - not applicable
-    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-    return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual std::string ProgramCore()
-  {
-      // In the case this test is run with MEMORY_ORDER_ACQUIRE, the store
-      // should be MEMORY_ORDER_RELEASE
-      std::string memoryOrderScopeLoad = MemoryOrderScopeStr();
-      std::string memoryOrderScopeStore =
-          (MemoryOrder() == MEMORY_ORDER_ACQUIRE)
-          ? (", memory_order_release" + MemoryScopeStr())
-          : memoryOrderScopeLoad;
-      std::string postfix(memoryOrderScopeLoad.empty() ? "" : "_explicit");
-      return "  atomic_store" + postfix + "(&destMemory[tid], tid"
-          + memoryOrderScopeStore
-          + ");\n"
-            "  oldValues[tid] = atomic_load"
-          + postfix + "(&destMemory[tid]" + memoryOrderScopeLoad + ");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    host_atomic_store(&destMemory[tid], (HostDataType)tid, MEMORY_ORDER_SEQ_CST);
-    oldValues[tid] = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[tid], MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = (HostDataType)whichDestValue;
-    return true;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    correct = true;
-    for(cl_uint i = 0; i < threadCount; i++ )
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestStore(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
-      if(refValues[i] != (HostDataType)i)
-      {
-        log_error("Invalid value for thread %u\n", (cl_uint)i);
-        correct = false;
+        OldValueCheck(false);
+    }
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        return threadCount;
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        if (MemoryOrder() == MEMORY_ORDER_ACQUIRE
+            || MemoryOrder() == MEMORY_ORDER_ACQ_REL)
+            return 0; // skip test - not applicable
+        if (CheckCapabilities(MemoryScope(), MemoryOrder())
+            == TEST_SKIPPED_ITSELF)
+            return 0; // skip test - not applicable
+        return CBasicTestMemOrderScope<
+            HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context,
+                                                             queue);
+    }
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
+        return "  atomic_store" + postfix + "(&destMemory[tid], tid"
+            + memoryOrderScope + ");\n";
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        host_atomic_store(&destMemory[tid], (HostDataType)tid, MemoryOrder());
+    }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = (HostDataType)whichDestValue;
         return true;
-      }
-    return true;
-  }
-int test_atomic_load_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
+int test_atomic_store_generic(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements,
+                              bool useSVM)
-  int error = 0;
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestLoad<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestLoad<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestLoad<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestLoad<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_load(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_load_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_load(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_load_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestExchange : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr;
-  CBasicTestExchange(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(123456);
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      "  oldValues[tid] = atomic_exchange"+postfix+"(&destMemory[0], tid"+memoryOrderScope+");\n"
-      "  for(int i = 0; i < "+IterationsStr()+"; i++)\n"
-      "    oldValues[tid] = atomic_exchange"+postfix+"(&destMemory[0], oldValues[tid]"+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    oldValues[tid] = host_atomic_exchange(&destMemory[0], (HostDataType)tid, MemoryOrder());
-    for(int i = 0; i < Iterations(); i++)
-      oldValues[tid] = host_atomic_exchange(&destMemory[0], oldValues[tid], MemoryOrder());
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    OldValueCheck(Iterations()%2 == 0); //check is valid for even number of iterations only
-    correct = true;
-    /* We are expecting values from 0 to size-1 and initial value from atomic variable */
-    /* These values must be distributed across refValues array and atomic variable finalVaue[0] */
-    /* Any repeated value is treated as an error */
-    std::vector<bool> tidFound(threadCount);
-    bool startValueFound = false;
-    cl_uint i;
-    for(i = 0; i <= threadCount; i++)
-    {
-      cl_uint value;
-      if(i == threadCount)
-        value = (cl_uint)finalValues[0]; //additional value from atomic variable (last written)
-      else
-        value = (cl_uint)refValues[i];
-      if(value == (cl_uint)StartValue())
-      {
-        // Special initial value
-        if(startValueFound)
-        {
-          log_error("ERROR: Starting reference value (%u) occurred more thane once\n", (cl_uint)StartValue());
-          correct = false;
-          return true;
-        }
-        startValueFound = true;
-        continue;
-      }
-      if(value >= threadCount)
-      {
-        log_error("ERROR: Reference value %u outside of valid range! (%u)\n", i, value);
-        correct = false;
-        return true;
-      }
-      if(tidFound[value])
-      {
-        log_error("ERROR: Value (%u) occurred more thane once\n", value);
-        correct = false;
-        return true;
-      }
-      tidFound[value] = true;
-    }
-    return true;
-  }
-int test_atomic_exchange_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestExchange<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  CBasicTestExchange<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM);
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  CBasicTestExchange<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM);
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestExchange<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestExchange<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestExchange<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestExchange<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestExchange<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestExchange<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_exchange(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_exchange_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_exchange(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_exchange_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestCompareStrong : public CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::OldValueCheck;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder2;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrderScope;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::Iterations;
-  using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::IterationsStr;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestCompareStrong(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(123456);
-    OldValueCheck(false);
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(MemoryOrder2() == MEMORY_ORDER_RELEASE ||
-      MemoryOrder2() == MEMORY_ORDER_ACQ_REL)
-      return 0; // not allowed as 'failure' argument
-    if((MemoryOrder() == MEMORY_ORDER_RELAXED && MemoryOrder2() != MEMORY_ORDER_RELAXED) ||
-      (MemoryOrder() != MEMORY_ORDER_SEQ_CST && MemoryOrder2() == MEMORY_ORDER_SEQ_CST))
-      return 0; // failure argument shall be no stronger than the success
-    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-    if (CheckCapabilities(MemoryScope(), MemoryOrder2()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-    return CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScope();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("  ")+DataType().RegularTypeName()+" expected, previous;\n"
-      "  int successCount = 0;\n"
-      "  oldValues[tid] = tid;\n"
-      "  expected = tid;  // force failure at the beginning\n"
-      "  if(atomic_compare_exchange_strong"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+") || expected == tid)\n"
-      "    oldValues[tid] = threadCount+1; //mark unexpected success with invalid value\n"
-      "  else\n"
-      "  {\n"
-      "    for(int i = 0; i < "+IterationsStr()+" || successCount == 0; i++)\n"
-      "    {\n"
-      "      previous = expected;\n"
-      "      if(atomic_compare_exchange_strong"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+"))\n"
-      "      {\n"
-      "        oldValues[tid] = expected;\n"
-      "        successCount++;\n"
-      "      }\n"
-      "      else\n"
-      "      {\n"
-      "        if(previous == expected) // spurious failure - shouldn't occur for 'strong'\n"
-      "        {\n"
-      "          oldValues[tid] = threadCount; //mark fail with invalid value\n"
-      "          break;\n"
-      "        }\n"
-      "      }\n"
-      "    }\n"
-      "  }\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    HostDataType expected = (HostDataType)StartValue(), previous;
-    oldValues[tid] = (HostDataType)tid;
-    for(int i = 0; i < Iterations(); i++)
-    {
-      previous = expected;
-      if(host_atomic_compare_exchange(&destMemory[0], &expected, oldValues[tid], MemoryOrder(), MemoryOrder2()))
-        oldValues[tid] = expected;
-      else
-      {
-        if(previous == expected) // shouldn't occur for 'strong'
-        {
-          oldValues[tid] = threadCount; //mark fail with invalid value
-        }
-      }
-    }
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    correct = true;
-    /* We are expecting values from 0 to size-1 and initial value from atomic variable */
-    /* These values must be distributed across refValues array and atomic variable finalVaue[0] */
-    /* Any repeated value is treated as an error */
-    std::vector<bool> tidFound(threadCount);
-    bool startValueFound = false;
-    cl_uint i;
-    for(i = 0; i <= threadCount; i++)
-    {
-      cl_uint value;
-      if(i == threadCount)
-        value = (cl_uint)finalValues[0]; //additional value from atomic variable (last written)
-      else
-        value = (cl_uint)refValues[i];
-      if(value == (cl_uint)StartValue())
-      {
-        // Special initial value
-        if(startValueFound)
-        {
-          log_error("ERROR: Starting reference value (%u) occurred more thane once\n", (cl_uint)StartValue());
-          correct = false;
-          return true;
-        }
-        startValueFound = true;
-        continue;
-      }
-      if(value >= threadCount)
-      {
-        if(value == threadCount)
-          log_error("ERROR: Spurious failure detected for atomic_compare_exchange_strong\n");
-        log_error("ERROR: Reference value %u outside of valid range! (%u)\n", i, value);
-        correct = false;
-        return true;
-      }
-      if(tidFound[value])
-      {
-        log_error("ERROR: Value (%u) occurred more thane once\n", value);
-        correct = false;
-        return true;
-      }
-      tidFound[value] = true;
-    }
-    return true;
-  }
-int test_atomic_compare_exchange_strong_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestCompareStrong<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  CBasicTestCompareStrong<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM);
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  CBasicTestCompareStrong<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM);
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  CBasicTestCompareStrong<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM);
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_compare_exchange_strong(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_compare_exchange_strong_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_compare_exchange_strong(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_compare_exchange_strong_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestCompareWeak : public CBasicTestCompareStrong<HostAtomicType, HostDataType>
-  using CBasicTestCompareStrong<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestCompareStrong<HostAtomicType, HostDataType>::MemoryOrderScope;
-  using CBasicTestCompareStrong<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestCompareStrong<HostAtomicType, HostDataType>::Iterations;
-  using CBasicTestCompareStrong<HostAtomicType, HostDataType>::IterationsStr;
-  CBasicTestCompareWeak(TExplicitAtomicType dataType, bool useSVM) : CBasicTestCompareStrong<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScope();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("  ")+DataType().RegularTypeName()+" expected , previous;\n"
-      "  int successCount = 0;\n"
-      "  oldValues[tid] = tid;\n"
-      "  expected = tid;  // force failure at the beginning\n"
-      "  if(atomic_compare_exchange_weak"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+") || expected == tid)\n"
-      "    oldValues[tid] = threadCount+1; //mark unexpected success with invalid value\n"
-      "  else\n"
-      "  {\n"
-      "    for(int i = 0; i < "+IterationsStr()+" || successCount == 0; i++)\n"
-      "    {\n"
-      "      previous = expected;\n"
-      "      if(atomic_compare_exchange_weak"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+"))\n"
-      "      {\n"
-      "        oldValues[tid] = expected;\n"
-      "        successCount++;\n"
-      "      }\n"
-      "    }\n"
-      "  }\n";
-  }
-int test_atomic_compare_exchange_weak_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestCompareWeak<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  CBasicTestCompareWeak<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM);
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  CBasicTestCompareWeak<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM);
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  CBasicTestCompareWeak<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM);
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_compare_exchange_weak(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_compare_exchange_weak_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_compare_exchange_weak(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_compare_exchange_weak_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchAdd : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      "  oldValues[tid] = atomic_fetch_add"+postfix+"(&destMemory[0], ("+DataType().AddSubOperandTypeName()+")tid + 3"+memoryOrderScope+");\n"+
-      "  atomic_fetch_add"+postfix+"(&destMemory[0], ("+DataType().AddSubOperandTypeName()+")tid + 3"+memoryOrderScope+");\n"
-      "  atomic_fetch_add"+postfix+"(&destMemory[0], ("+DataType().AddSubOperandTypeName()+")tid + 3"+memoryOrderScope+");\n"
-      "  atomic_fetch_add"+postfix+"(&destMemory[0], (("+DataType().AddSubOperandTypeName()+")tid + 3) << (sizeof("+DataType().AddSubOperandTypeName()+")-1)*8"+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    oldValues[tid] = host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, MemoryOrder());
-    host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, MemoryOrder());
-    host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, MemoryOrder());
-    host_atomic_fetch_add(&destMemory[0], ((HostDataType)tid + 3) << (sizeof(HostDataType)-1)*8, MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = StartValue();
-    for(cl_uint i = 0; i < threadCount; i++)
-      expected += ((HostDataType)i+3)*3+(((HostDataType)i + 3) << (sizeof(HostDataType)-1)*8);
-    return true;
-  }
-int test_atomic_fetch_add_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchAdd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchSub : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  CBasicTestFetchSub(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      "  oldValues[tid] = atomic_fetch_sub"+postfix+"(&destMemory[0], tid + 3 +((("+DataType().AddSubOperandTypeName()+")tid + 3) << (sizeof("+DataType().AddSubOperandTypeName()+")-1)*8)"+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    oldValues[tid] = host_atomic_fetch_sub(&destMemory[0], (HostDataType)tid + 3+(((HostDataType)tid + 3) << (sizeof(HostDataType)-1)*8), MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = StartValue();
-    for(cl_uint i = 0; i < threadCount; i++)
-      expected -= (HostDataType)i + 3 +(((HostDataType)i + 3) << (sizeof(HostDataType)-1)*8);
-    return true;
-  }
-int test_atomic_fetch_sub_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchSub<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchSub<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchSub<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchOr : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  CBasicTestFetchOr(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(0);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    cl_uint numBits = DataType().Size(deviceID) * 8;
-    return (threadCount + numBits - 1) / numBits;
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("    size_t numBits = sizeof(")+DataType().RegularTypeName()+") * 8;\n"
-      "    int whichResult = tid / numBits;\n"
-      "    int bitIndex = tid - (whichResult * numBits);\n"
-      "\n"
-      "    oldValues[tid] = atomic_fetch_or"+postfix+"(&destMemory[whichResult], (("+DataType().RegularTypeName()+")1 << bitIndex) "+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    size_t numBits = sizeof(HostDataType) * 8;
-    size_t whichResult = tid / numBits;
-    size_t bitIndex = tid - (whichResult * numBits);
-    oldValues[tid] = host_atomic_fetch_or(&destMemory[whichResult], ((HostDataType)1 << bitIndex), MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    cl_uint numValues = (threadCount + (sizeof(HostDataType)*8-1)) / (sizeof(HostDataType)*8);
-    if(whichDestValue < numValues - 1)
-    {
-      expected = ~(HostDataType)0;
-      return true;
-    }
-    // Last item doesn't get or'ed on every bit, so we have to mask away
-    cl_uint numBits = threadCount - whichDestValue * (sizeof(HostDataType)*8);
-    expected = StartValue();
-    for(cl_uint i = 0; i < numBits; i++)
-      expected |= ((HostDataType)1 << i);
-    return true;
-  }
-int test_atomic_fetch_or_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchOr<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOr<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOr<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchXor : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  CBasicTestFetchXor(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue((HostDataType)0x2f08ab418ba0541LL);
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("  int numBits = sizeof(")+DataType().RegularTypeName()+") * 8;\n"
-      "  int bitIndex = (numBits-1)*(tid+1)/threadCount;\n"
-      "\n"
-      "  oldValues[tid] = atomic_fetch_xor"+postfix+"(&destMemory[0], (("+DataType().RegularTypeName()+")1 << bitIndex) "+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    int numBits = sizeof(HostDataType) * 8;
-    int bitIndex = (numBits-1)*(tid+1)/threadCount;
-    oldValues[tid] = host_atomic_fetch_xor(&destMemory[0], ((HostDataType)1 << bitIndex), MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    int numBits = sizeof(HostDataType)*8;
-    expected = StartValue();
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      int bitIndex = (numBits-1)*(i+1)/threadCount;
-      expected ^= ((HostDataType)1 << bitIndex);
-    }
-    return true;
-  }
-int test_atomic_fetch_xor_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchXor<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchAnd : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  CBasicTestFetchAnd(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(~(HostDataType)0);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    cl_uint numBits = DataType().Size(deviceID) * 8;
-    return (threadCount + numBits - 1) / numBits;
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("  size_t numBits = sizeof(")+DataType().RegularTypeName()+") * 8;\n"
-      "  int whichResult = tid / numBits;\n"
-      "  int bitIndex = tid - (whichResult * numBits);\n"
-      "\n"
-      "  oldValues[tid] = atomic_fetch_and"+postfix+"(&destMemory[whichResult], ~(("+DataType().RegularTypeName()+")1 << bitIndex) "+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    size_t numBits = sizeof(HostDataType) * 8;
-    size_t whichResult = tid / numBits;
-    size_t bitIndex = tid - (whichResult * numBits);
-    oldValues[tid] = host_atomic_fetch_and(&destMemory[whichResult], ~((HostDataType)1 << bitIndex), MemoryOrder());
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    cl_uint numValues = (threadCount + (sizeof(HostDataType)*8-1)) / (sizeof(HostDataType)*8);
-    if(whichDestValue < numValues - 1)
-    {
-      expected = 0;
-      return true;
-    }
-    // Last item doesn't get and'ed on every bit, so we have to mask away
-    size_t numBits = threadCount - whichDestValue * (sizeof(HostDataType)*8);
-    expected = StartValue();
-    for(size_t i = 0; i < numBits; i++)
-      expected &= ~((HostDataType)1 << i);
-    return true;
-  }
-int test_atomic_fetch_and_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchOrAnd : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr;
-  CBasicTestFetchOrAnd(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(0);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return 1+(threadCount-1)/(DataType().Size(deviceID)*8);
-  }
-  // each thread modifies (with OR and AND operations) and verifies
-  // only one bit in atomic variable
-  // other bits are modified by other threads but it must not affect current thread operation
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("  int bits = sizeof(")+DataType().RegularTypeName()+")*8;\n"+
-      "  size_t valueInd = tid/bits;\n"
-      "  "+DataType().RegularTypeName()+" value, bitMask = ("+DataType().RegularTypeName()+")1 << tid%bits;\n"
-      "  oldValues[tid] = 0;\n"
-      "  for(int i = 0; i < "+IterationsStr()+"; i++)\n"
-      "  {\n"
-      "    value = atomic_fetch_or"+postfix+"(destMemory+valueInd, bitMask"+memoryOrderScope+");\n"
-      "    if(value & bitMask) // bit should be set to 0\n"
-      "      oldValues[tid]++;\n"
-      "    value = atomic_fetch_and"+postfix+"(destMemory+valueInd, ~bitMask"+memoryOrderScope+");\n"
-      "    if(!(value & bitMask)) // bit should be set to 1\n"
-      "      oldValues[tid]++;\n"
-      "  }\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    int bits = sizeof(HostDataType)*8;
-    size_t valueInd = tid/bits;
-    HostDataType value, bitMask = (HostDataType)1 << tid%bits;
-    oldValues[tid] = 0;
-    for(int i = 0; i < Iterations(); i++)
-    {
-      value = host_atomic_fetch_or(destMemory+valueInd, bitMask, MemoryOrder());
-      if(value & bitMask) // bit should be set to 0
-        oldValues[tid]++;
-      value = host_atomic_fetch_and(destMemory+valueInd, ~bitMask, MemoryOrder());
-      if(!(value & bitMask)) // bit should be set to 1
-        oldValues[tid]++;
-    }
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = 0;
-    return true;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    correct = true;
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      if(refValues[i] > 0)
-      {
-        log_error("Thread %d found %d mismatch(es)\n", i, (cl_uint)refValues[i]);
-        correct = false;
-      }
-    }
-    return true;
-  }
-int test_atomic_fetch_orand_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchOrAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_orand(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_orand_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_orand(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_orand_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchXor2 : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr;
-  CBasicTestFetchXor2(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(0);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return 1+(threadCount-1)/(DataType().Size(deviceID)*8);
-  }
-  // each thread modifies (with XOR operation) and verifies
-  // only one bit in atomic variable
-  // other bits are modified by other threads but it must not affect current thread operation
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      std::string("  int bits = sizeof(")+DataType().RegularTypeName()+")*8;\n"+
-      "  size_t valueInd = tid/bits;\n"
-      "  "+DataType().RegularTypeName()+" value, bitMask = ("+DataType().RegularTypeName()+")1 << tid%bits;\n"
-      "  oldValues[tid] = 0;\n"
-      "  for(int i = 0; i < "+IterationsStr()+"; i++)\n"
-      "  {\n"
-      "    value = atomic_fetch_xor"+postfix+"(destMemory+valueInd, bitMask"+memoryOrderScope+");\n"
-      "    if(value & bitMask) // bit should be set to 0\n"
-      "      oldValues[tid]++;\n"
-      "    value = atomic_fetch_xor"+postfix+"(destMemory+valueInd, bitMask"+memoryOrderScope+");\n"
-      "    if(!(value & bitMask)) // bit should be set to 1\n"
-      "      oldValues[tid]++;\n"
-      "  }\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    int bits = sizeof(HostDataType)*8;
-    size_t valueInd = tid/bits;
-    HostDataType value, bitMask = (HostDataType)1 << tid%bits;
-    oldValues[tid] = 0;
-    for(int i = 0; i < Iterations(); i++)
-    {
-      value = host_atomic_fetch_xor(destMemory+valueInd, bitMask, MemoryOrder());
-      if(value & bitMask) // bit should be set to 0
-        oldValues[tid]++;
-      value = host_atomic_fetch_xor(destMemory+valueInd, bitMask, MemoryOrder());
-      if(!(value & bitMask)) // bit should be set to 1
-        oldValues[tid]++;
-    }
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = 0;
-    return true;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    correct = true;
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      if(refValues[i] > 0)
-      {
-        log_error("Thread %d found %d mismatches\n", i, (cl_uint)refValues[i]);
-        correct = false;
-      }
-    }
-    return true;
-  }
-int test_atomic_fetch_xor2_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchXor2<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  CBasicTestFetchXor2<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM);
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  CBasicTestFetchXor2<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM);
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_xor2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_xor2_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_xor2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_xor2_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchMin : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  CBasicTestFetchMin(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(DataType().MaxValue());
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      "  oldValues[tid] = atomic_fetch_min"+postfix+"(&destMemory[0], oldValues[tid] "+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    oldValues[tid] = host_atomic_fetch_min(&destMemory[0], oldValues[tid], MemoryOrder());
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      startRefValues[i] = genrand_int32(d);
-      if(sizeof(HostDataType) >= 8)
-        startRefValues[i] |= (HostDataType)genrand_int32(d) << 16;
-    }
-    return true;
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = StartValue();
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      if(startRefValues[ i ] < expected)
-        expected = startRefValues[ i ];
-    }
-    return true;
-  }
-int test_atomic_fetch_min_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchMin<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMin<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMin<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFetchMax : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  CBasicTestFetchMax(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(DataType().MinValue());
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    return
-      "  oldValues[tid] = atomic_fetch_max"+postfix+"(&destMemory[0], oldValues[tid] "+memoryOrderScope+");\n";
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    oldValues[tid] = host_atomic_fetch_max(&destMemory[0], oldValues[tid], MemoryOrder());
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      startRefValues[i] = genrand_int32(d);
-      if(sizeof(HostDataType) >= 8)
-        startRefValues[i] |= (HostDataType)genrand_int32(d) << 16;
-    }
-    return true;
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = StartValue();
-    for(cl_uint i = 0; i < threadCount; i++)
-    {
-      if(startRefValues[ i ] > expected)
-        expected = startRefValues[ i ];
-    }
-    return true;
-  }
-int test_atomic_fetch_max_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  CBasicTestFetchMax<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMax<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMax<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
-int test_atomic_fetch_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_fetch_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFlag : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  static const HostDataType CRITICAL_SECTION_NOT_VISITED = 1000000000;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
-  CBasicTestFlag(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(0);
-    OldValueCheck(false);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return threadCount;
-  }
-  TExplicitMemoryOrderType MemoryOrderForClear()
-  {
-    // Memory ordering for atomic_flag_clear function
-    // ("shall not be memory_order_acquire nor memory_order_acq_rel")
-    if(MemoryOrder() == MEMORY_ORDER_ACQUIRE)
-    if (MemoryOrder() == MEMORY_ORDER_ACQ_REL)
-    return MemoryOrder();
-  }
-  std::string MemoryOrderScopeStrForClear()
-  {
-    std::string orderStr;
-    if (MemoryOrder() != MEMORY_ORDER_EMPTY)
-      orderStr = std::string(", ") + get_memory_order_type_name(MemoryOrderForClear());
-    return orderStr + MemoryScopeStr();
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
-                                cl_command_queue queue)
-  {
-      // This test assumes support for the memory_scope_device scope in the case
-      // that LocalMemory() == false. Therefore we should skip this test in that
-      // configuration on a 3.0 driver since supporting the memory_scope_device
-      // scope is optionaly.
-      if (get_device_cl_version(deviceID) >= Version{ 3, 0 })
-      {
-          if (!LocalMemory()
-              && !(gAtomicFenceCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE))
-          {
-              log_info(
-                  "Skipping atomic_flag test due to use of atomic_scope_device "
-                  "which is optionally not supported on this device\n");
-              return 0; // skip test - not applicable
-          }
-      }
-      return CBasicTestMemOrderScope<HostAtomicType,
-                                     HostDataType>::ExecuteSingleTest(deviceID,
-                                                                      context,
-                                                                      queue);
-  }
-  virtual std::string ProgramCore()
-  {
-    std::string memoryOrderScope = MemoryOrderScopeStr();
-    std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
-    std::string program =
-      "  uint cnt, stop = 0;\n"
-      "  for(cnt = 0; !stop && cnt < threadCount; cnt++) // each thread must find critical section where it is the first visitor\n"
-      "  {\n"
-      "    bool set = atomic_flag_test_and_set" + postfix + "(&destMemory[cnt]" + memoryOrderScope + ");\n";
-    if (MemoryOrder() == MEMORY_ORDER_RELAXED || MemoryOrder() == MEMORY_ORDER_RELEASE)
-      program += "    atomic_work_item_fence(" +
-                 std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") +
-                 "memory_order_acquire," +
-                 std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) +
-                 ");\n";
-    program +=
-      "    if (!set)\n"
-      "    {\n";
-    if (LocalMemory())
-      program += "      uint csIndex = get_enqueued_local_size(0)*get_group_id(0)+cnt;\n";
-    else
-      program += "      uint csIndex = cnt;\n";
-    std::ostringstream csNotVisited;
-    program +=
-      "      // verify that thread is the first visitor\n"
-      "      if(oldValues[csIndex] == "+csNotVisited.str()+")\n"
-      "      {\n"
-      "        oldValues[csIndex] = tid; // set the winner id for this critical section\n"
-      "        stop = 1;\n"
-      "      }\n";
-    if (MemoryOrder() == MEMORY_ORDER_ACQUIRE || MemoryOrder() == MEMORY_ORDER_RELAXED)
-      program += "      atomic_work_item_fence(" +
-                 std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") +
-                 "memory_order_release," +
-                 std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) +
-                 ");\n";
-    program +=
-      "      atomic_flag_clear" + postfix + "(&destMemory[cnt]" + MemoryOrderScopeStrForClear() + ");\n"
-      "    }\n"
-      "  }\n";
-    return program;
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    cl_uint cnt, stop = 0;
-    for (cnt = 0; !stop && cnt < threadCount; cnt++) // each thread must find critical section where it is the first visitor\n"
-    {
-      if (!host_atomic_flag_test_and_set(&destMemory[cnt], MemoryOrder()))
-      {
-        cl_uint csIndex = cnt;
-        // verify that thread is the first visitor\n"
-        if (oldValues[csIndex] == CRITICAL_SECTION_NOT_VISITED)
-        {
-          oldValues[csIndex] = tid; // set the winner id for this critical section\n"
-          stop = 1;
-        }
-        host_atomic_flag_clear(&destMemory[cnt], MemoryOrderForClear());
-      }
-    }
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    expected = StartValue();
-    return true;
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    for(cl_uint i = 0 ; i < threadCount; i++)
-      startRefValues[i] = CRITICAL_SECTION_NOT_VISITED;
-    return true;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    correct = true;
-    /* We are expecting unique values from 0 to threadCount-1 (each critical section must be visited) */
-    /* These values must be distributed across refValues array */
-    std::vector<bool> tidFound(threadCount);
-    cl_uint i;
-    for (i = 0; i < threadCount; i++)
-    {
-      cl_uint value = (cl_uint)refValues[i];
-      {
-        // Special initial value
-        log_error("ERROR: Critical section %u not visited\n", i);
-        correct = false;
-        return true;
-      }
-      if (value >= threadCount)
-      {
-        log_error("ERROR: Reference value %u outside of valid range! (%u)\n", i, value);
-        correct = false;
-        return true;
-      }
-      if (tidFound[value])
-      {
-        log_error("ERROR: Value (%u) occurred more thane once\n", value);
-        correct = false;
-        return true;
-      }
-      tidFound[value] = true;
-    }
-    return true;
-  }
-int test_atomic_flag_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
-  int error = 0;
-  EXECUTE_TEST(error, test_flag.Execute(deviceID, context, queue, num_elements));
-  return error;
-int test_atomic_flag(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_flag_generic(deviceID, context, queue, num_elements, false);
-int test_svm_atomic_flag(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  return test_atomic_flag_generic(deviceID, context, queue, num_elements, true);
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestFence : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-  struct TestDefinition {
-    bool op1IsFence;
-    TExplicitMemoryOrderType op1MemOrder;
-    bool op2IsFence;
-    TExplicitMemoryOrderType op2MemOrder;
-  };
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DeclaredInProgram;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UsedInFunction;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::CurrentGroupSize;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalRefValues;
-  CBasicTestFence(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-    StartValue(0);
-    OldValueCheck(false);
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return threadCount;
-  }
-  virtual cl_uint NumNonAtomicVariablesPerThread()
-  {
-    if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
-      return 1;
-    if (LocalMemory())
-    {
-      if (gIsEmbedded)
-      {
-        if (CurrentGroupSize() > 1024)
-          CurrentGroupSize(1024);
-        return 1; //1KB of local memory required by spec. Clamp group size to 1k and allow 1 variable per thread
-      }
-      else
-        return 32 * 1024 / 8 / CurrentGroupSize() - 1; //32KB of local memory required by spec
-    }
-    return 256;
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName;
-    if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
-      testName += "seq_cst fence, ";
-    else
-      testName += std::string(get_memory_order_type_name(_subCase.op1MemOrder)).substr(sizeof("memory_order"))
-        + (_subCase.op1IsFence ? " fence" : " atomic") + " synchronizes-with "
-        + std::string(get_memory_order_type_name(_subCase.op2MemOrder)).substr(sizeof("memory_order"))
-        + (_subCase.op2IsFence ? " fence" : " atomic") + ", ";
-    testName += CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    testName += std::string(", ") + std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
-    return testName;
-  }
-  virtual bool SVMDataBufferAllSVMConsistent()
-  {
-      // Although memory_scope_all_devices doesn't mention SVM it is just an
-      // alias for memory_scope_all_svm_devices.  So both scopes interact with
-      // SVM allocations, on devices that support those, just the same.
-      return MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
-          || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
     int error = 0;
-    // execute 3 (maximum) sub cases for each memory order
-    for (_subCaseId = 0; _subCaseId < 3; _subCaseId++)
+                                                        useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_float.Execute(deviceID, context, queue, num_elements));
+    CBasicTestStore<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(
+    EXECUTE_TEST(error,
+                 test_double.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-      EXECUTE_TEST(error, (CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
+        CBasicTestStore<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestStore<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestStore<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestStore<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
     return error;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(DeclaredInProgram() || UsedInFunction())
-      return 0; //skip test - not applicable - no overloaded fence functions for different address spaces
-    if(MemoryOrder() == MEMORY_ORDER_EMPTY ||
-      MemoryScope() == MEMORY_SCOPE_EMPTY) // empty 'scope' not required since opencl20-openclc-rev15
-      return 0; //skip test - not applicable
-    if((UseSVM() || gHost)
-      && LocalMemory())
-      return 0; // skip test - not applicable for SVM and local memory
-    struct TestDefinition acqTests[] = {
-      // {op1IsFence, op1MemOrder, op2IsFence, op2MemOrder}
-    };
-    struct TestDefinition relTests[] = {
-    };
-    struct TestDefinition arTests[] = {
-    };
-    switch (MemoryOrder())
-    {
-      if (_subCaseId >= sizeof(acqTests) / sizeof(struct TestDefinition))
-        return 0;
-      _subCase = acqTests[_subCaseId];
-      break;
-      if (_subCaseId >= sizeof(relTests) / sizeof(struct TestDefinition))
-        return 0;
-      _subCase = relTests[_subCaseId];
-      break;
-      if (_subCaseId >= sizeof(arTests) / sizeof(struct TestDefinition))
-        return 0;
-      _subCase = arTests[_subCaseId];
-      break;
-      if (_subCaseId != 0) // one special case only
-        return 0;
-      break;
-    default:
-      return 0;
-    }
-    LocalRefValues(LocalMemory());
-    return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems)
-  {
-    std::string header;
-    if(gOldAPI)
-    {
-      if(MemoryScope() == MEMORY_SCOPE_EMPTY)
-      {
-        header += "#define atomic_work_item_fence(x,y)                        mem_fence(x)\n";
-      }
-      else
-      {
-        header += "#define atomic_work_item_fence(x,y,z)                      mem_fence(x)\n";
-      }
-    }
-    return header+CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
-  }
-  virtual std::string ProgramCore()
-  {
-    std::ostringstream naValues;
-    naValues << NumNonAtomicVariablesPerThread();
-    std::string program, fenceType, nonAtomic;
-    if (LocalMemory())
-    {
-      program = "  size_t myId = get_local_id(0), hisId = get_local_size(0)-1-myId;\n";
-      fenceType = "CLK_LOCAL_MEM_FENCE";
-      nonAtomic = "localValues";
-    }
-    else
-    {
-      program = "  size_t myId = tid, hisId = threadCount-1-tid;\n";
-      fenceType = "CLK_GLOBAL_MEM_FENCE";
-      nonAtomic = "oldValues";
-    }
-    if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
-    {
-      // All threads are divided into pairs.
-      // Each thread has its own atomic variable and performs the following actions:
-      // - increments its own variable
-      // - performs fence operation to propagate its value and to see value from other thread
-      // - reads value from other thread's variable
-      // - repeats the above steps when both values are the same (and less than 1000000)
-      // - stores the last value read from other thread (in additional variable)
-      // At the end of execution at least one thread should know the last value from other thread
-      program += std::string("") +
-        "  " + DataType().RegularTypeName() + " myValue = 0, hisValue; \n"
-        "  do {\n"
-        "    myValue++;\n"
-        "    atomic_store_explicit(&destMemory[myId], myValue, memory_order_relaxed" + MemoryScopeStr() + ");\n"
-        "    atomic_work_item_fence(" + fenceType + ", memory_order_seq_cst" + MemoryScopeStr() + "); \n"
-        "    hisValue = atomic_load_explicit(&destMemory[hisId], memory_order_relaxed" + MemoryScopeStr() + ");\n"
-        "  } while(myValue == hisValue && myValue < 1000000);\n"
-        "  " + nonAtomic + "[myId] = hisValue; \n";
-    }
-    else
-    {
-      // Each thread modifies one of its non-atomic variables, increments value of its atomic variable
-      // and reads values from another thread in typical synchronizes-with scenario with:
-      // - non-atomic variable (at index A) modification (value change from 0 to A)
-      // - release operation (additional fence or within atomic) + atomic variable modification (value A)
-      // - atomic variable read (value B) + acquire operation (additional fence or within atomic)
-      // - non-atomic variable (at index B) read (value C)
-      // Each thread verifies dependency between atomic and non-atomic value read from another thread
-      // The following condition must be true: B == C
-      program += std::string("") +
-        "  " + DataType().RegularTypeName() + " myValue = 0, hisAtomicValue, hisValue; \n"
-        "  do {\n"
-        "    myValue++;\n"
-        "    " + nonAtomic + "[myId*" + naValues.str() +"+myValue] = myValue;\n";
-      if (_subCase.op1IsFence)
-        program += std::string("") +
-        "    atomic_work_item_fence(" + fenceType + ", " + get_memory_order_type_name(_subCase.op1MemOrder) + MemoryScopeStr() + "); \n"
-        "    atomic_store_explicit(&destMemory[myId], myValue, memory_order_relaxed" + MemoryScopeStr() + ");\n";
-      else
-        program += std::string("") +
-        "    atomic_store_explicit(&destMemory[myId], myValue, " + get_memory_order_type_name(_subCase.op1MemOrder) + MemoryScopeStr() + ");\n";
-      if (_subCase.op2IsFence)
-        program += std::string("") +
-        "    hisAtomicValue = atomic_load_explicit(&destMemory[hisId], memory_order_relaxed" + MemoryScopeStr() + ");\n"
-        "    atomic_work_item_fence(" + fenceType + ", " + get_memory_order_type_name(_subCase.op2MemOrder) + MemoryScopeStr() + "); \n";
-      else
-        program += std::string("") +
-        "    hisAtomicValue = atomic_load_explicit(&destMemory[hisId], " + get_memory_order_type_name(_subCase.op2MemOrder) + MemoryScopeStr() + ");\n";
-      program +=
-        "    hisValue = " + nonAtomic + "[hisId*" + naValues.str() + "+hisAtomicValue]; \n";
-      if (LocalMemory())
-        program += "    hisId = (hisId+1)%get_local_size(0);\n";
-      else
-        program += "    hisId = (hisId+1)%threadCount;\n";
-      program +=
-        "  } while(hisAtomicValue == hisValue && myValue < "+naValues.str()+"-1);\n"
-        "  if(hisAtomicValue != hisValue)\n"
-        "  { // fail\n"
-        "    atomic_store(&destMemory[myId], myValue-1);\n";
-      if (LocalMemory())
-        program += "    hisId = (hisId+get_local_size(0)-1)%get_local_size(0);\n";
-      else
-        program += "    hisId = (hisId+threadCount-1)%threadCount;\n";
-      program +=
-        "    if(myValue+1 < " + naValues.str() + ")\n"
-        "      " + nonAtomic + "[myId*" + naValues.str() + "+myValue+1] = hisId;\n"
-        "    if(myValue+2 < " + naValues.str() + ")\n"
-        "      " + nonAtomic + "[myId*" + naValues.str() + "+myValue+2] = hisAtomicValue;\n"
-        "    if(myValue+3 < " + naValues.str() + ")\n"
-        "      " + nonAtomic + "[myId*" + naValues.str() + "+myValue+3] = hisValue;\n";
-      if (gDebug)
-      {
-        program +=
-          "    printf(\"WI %d: atomic value (%d) at index %d is different than non-atomic value (%d)\\n\", tid, hisAtomicValue, hisId, hisValue);\n";
-      }
-      program +=
-        "  }\n";
-    }
-    return program;
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    size_t myId = tid, hisId = threadCount - 1 - tid;
-    if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
-    {
-      HostDataType myValue = 0, hisValue;
-      // CPU thread typically starts faster - wait for GPU thread
-      myValue++;
-      host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, MEMORY_ORDER_SEQ_CST);
-      while (host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], MEMORY_ORDER_SEQ_CST) == 0);
-      do {
-        myValue++;
-        host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, MEMORY_ORDER_RELAXED);
-        host_atomic_thread_fence(MemoryOrder());
-        hisValue = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], MEMORY_ORDER_RELAXED);
-      } while (myValue == hisValue && hisValue < 1000000);
-      oldValues[tid] = hisValue;
-    }
-    else
-    {
-      HostDataType myValue = 0, hisAtomicValue, hisValue;
-      do {
-        myValue++;
-        oldValues[myId*NumNonAtomicVariablesPerThread()+myValue] = myValue;
-        if (_subCase.op1IsFence)
-        {
-          host_atomic_thread_fence(_subCase.op1MemOrder);
-          host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, MEMORY_ORDER_RELAXED);
-        }
-        else
-          host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, _subCase.op1MemOrder);
-        if (_subCase.op2IsFence)
-        {
-          hisAtomicValue = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], MEMORY_ORDER_RELAXED);
-          host_atomic_thread_fence(_subCase.op2MemOrder);
-        }
-        else
-          hisAtomicValue = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], _subCase.op2MemOrder);
-        hisValue = oldValues[hisId*NumNonAtomicVariablesPerThread() + hisAtomicValue];
-        hisId = (hisId + 1) % threadCount;
-      } while(hisAtomicValue == hisValue && myValue < (HostDataType)NumNonAtomicVariablesPerThread()-1);
-      if(hisAtomicValue != hisValue)
-      { // fail
-        host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue-1, MEMORY_ORDER_SEQ_CST);
-        if (gDebug)
-        {
-          hisId = (hisId + threadCount - 1) % threadCount;
-          printf("WI %d: atomic value (%d) at index %d is different than non-atomic value (%d)\n", tid, hisAtomicValue, hisId, hisValue);
-        }
-      }
-    }
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    for(cl_uint i = 0 ; i < threadCount*NumNonAtomicVariablesPerThread(); i++)
-      startRefValues[i] = 0;
-    return true;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    correct = true;
-    cl_uint workSize = LocalMemory() ? CurrentGroupSize() : threadCount;
-    for(cl_uint workOffset = 0; workOffset < threadCount; workOffset+= workSize)
-    {
-      if(workOffset+workSize > threadCount)
-        // last workgroup (host threads)
-        workSize = threadCount-workOffset;
-      for(cl_uint i = 0 ; i < workSize && workOffset+i < threadCount; i++)
-      {
-        HostAtomicType myValue = finalValues[workOffset + i];
-        if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
-        {
-          HostDataType hisValue = refValues[workOffset + i];
-          if (myValue == hisValue)
-          {
-            // a draw - both threads should reach final value 1000000
-            if (myValue != 1000000)
-            {
-              log_error("ERROR: Invalid reference value #%u (%d instead of 1000000)\n", workOffset + i, myValue);
-              correct = false;
-              return true;
-            }
-          }
-          else
-          {
-            //slower thread (in total order of seq_cst operations) must know last value written by faster thread
-            HostAtomicType hisRealValue = finalValues[workOffset + workSize - 1 - i];
-            HostDataType myValueReadByHim = refValues[workOffset + workSize - 1 - i];
-            // who is the winner? - thread with lower private counter value
-            if (myValue == hisRealValue) // forbidden result - fence doesn't work
-            {
-              log_error("ERROR: Atomic counter values #%u and #%u are the same (%u)\n", workOffset + i, workOffset + workSize - 1 - i, myValue);
-              log_error("ERROR: Both threads have outdated values read from another thread (%u and %u)\n", hisValue, myValueReadByHim);
-              correct = false;
-              return true;
-            }
-            if (myValue > hisRealValue) // I'm slower
-            {
-              if (hisRealValue != hisValue)
-              {
-                log_error("ERROR: Invalid reference value #%u (%d instead of %d)\n", workOffset + i, hisValue, hisRealValue);
-                log_error("ERROR: Slower thread #%u should know value written by faster thread #%u\n", workOffset + i, workOffset + workSize - 1 - i);
-                correct = false;
-                return true;
-              }
-            }
-            else // I'm faster
-            {
-              if (myValueReadByHim != myValue)
-              {
-                log_error("ERROR: Invalid reference value #%u (%d instead of %d)\n", workOffset + workSize - 1 - i, myValueReadByHim, myValue);
-                log_error("ERROR: Slower thread #%u should know value written by faster thread #%u\n", workOffset + workSize - 1 - i, workOffset + i);
-                correct = false;
-                return true;
-              }
-            }
-          }
-        }
-        else
-        {
-          if (myValue != NumNonAtomicVariablesPerThread()-1)
-          {
-            log_error("ERROR: Invalid atomic value #%u (%d instead of %d)\n", workOffset + i, myValue, NumNonAtomicVariablesPerThread()-1);
-            log_error("ERROR: Thread #%u observed invalid values in other thread's variables\n", workOffset + i, myValue);
-            correct = false;
-            return true;
-          }
-        }
-      }
+int test_atomic_store(cl_device_id deviceID, cl_context context,
+                      cl_command_queue queue, int num_elements)
+    return test_atomic_store_generic(deviceID, context, queue, num_elements,
+                                     false);
+int test_svm_atomic_store(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+    return test_atomic_store_generic(deviceID, context, queue, num_elements,
+                                     true);
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestInit : public CBasicTest<HostAtomicType, HostDataType> {
+    using CBasicTest<HostAtomicType, HostDataType>::OldValueCheck;
+    CBasicTestInit(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
+    {
+        OldValueCheck(false);
-    return true;
-  }
-  int _subCaseId;
-  struct TestDefinition _subCase;
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        return threadCount;
+    }
+    virtual std::string ProgramCore()
+    {
+        return "  atomic_init(&destMemory[tid], tid);\n";
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        host_atomic_init(&destMemory[tid], (HostDataType)tid);
+    }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = (HostDataType)whichDestValue;
+        return true;
+    }
-int test_atomic_fence_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM)
+int test_atomic_init_generic(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements,
+                             bool useSVM)
-  int error = 0;
-  CBasicTestFence<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
-  EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements));
-  EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements));
-  if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
-  {
-    CBasicTestFence<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFence<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFence<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFence<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  else
-  {
-    CBasicTestFence<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFence<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
-    EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFence<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
-    EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements));
-    CBasicTestFence<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
-    EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
-  }
-  return error;
+    int error = 0;
+    CBasicTestInit<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                          useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                          useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_float.Execute(deviceID, context, queue, num_elements));
+    CBasicTestInit<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(
+    EXECUTE_TEST(error,
+                 test_double.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestInit<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestInit<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestInit<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestInit<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
-int test_atomic_fence(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_atomic_init(cl_device_id deviceID, cl_context context, // Test entry point: forwards its arguments to test_atomic_init_generic.
+                     cl_command_queue queue, int num_elements)
-  return test_atomic_fence_generic(deviceID, context, queue, num_elements, false);
+    return test_atomic_init_generic(deviceID, context, queue, num_elements,
+                                    false); // final argument is presumably the helper's useSVM flag (its signature is not shown here) - confirm
-int test_svm_atomic_fence(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_svm_atomic_init(cl_device_id deviceID, cl_context context, // Test entry point: same forwarding as test_atomic_init, but passes true as the last argument.
+                         cl_command_queue queue, int num_elements)
-  return test_atomic_fence_generic(deviceID, context, queue, num_elements, true);
+    return test_atomic_init_generic(deviceID, context, queue, num_elements,
+                                    true); // final argument is presumably the helper's useSVM flag (its signature is not shown here) - confirm
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestLoad // Atomic load test: each work-item stores its own id into destMemory[tid] and reads it back with atomic_load.
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestLoad(TExplicitAtomicType dataType, bool useSVM) // forwards dataType and useSVM unchanged to the base-class constructor
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        OldValueCheck(false); // NOTE(review): presumably turns off the base-class old-value verification - confirm
+    }
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        return threadCount; // one result per thread; deviceID is not used here
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        if (MemoryOrder() == MEMORY_ORDER_RELEASE
+            || MemoryOrder() == MEMORY_ORDER_ACQ_REL)
+            return 0; // skip test - release and acq_rel orders are treated as not applicable to this load test
+        if (CheckCapabilities(MemoryScope(), MemoryOrder())
+            == TEST_SKIPPED_ITSELF)
+            return 0; // skip test - CheckCapabilities returned TEST_SKIPPED_ITSELF for this scope/order pair
+        return CBasicTestMemOrderScope<
+            HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context,
+                                                             queue);
+    }
+    virtual std::string ProgramCore() // builds the kernel source fragment: an atomic_store followed by an atomic_load on destMemory[tid]
+    {
+        // In the case this test is run with MEMORY_ORDER_ACQUIRE, the store
+        // should be MEMORY_ORDER_RELEASE
+        std::string memoryOrderScopeLoad = MemoryOrderScopeStr();
+        std::string memoryOrderScopeStore =
+            (MemoryOrder() == MEMORY_ORDER_ACQUIRE)
+            ? (", memory_order_release" + MemoryScopeStr())
+            : memoryOrderScopeLoad;
+        std::string postfix(memoryOrderScopeLoad.empty() ? "" : "_explicit"); // the "_explicit" function variants are used only when an order/scope string is present
+        return "  atomic_store" + postfix + "(&destMemory[tid], tid"
+            + memoryOrderScopeStore
+            + ");\n"
+              "  oldValues[tid] = atomic_load"
+            + postfix + "(&destMemory[tid]" + memoryOrderScopeLoad + ");\n";
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount, // host-side counterpart of the kernel fragment built by ProgramCore
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        host_atomic_store(&destMemory[tid], (HostDataType)tid,
+                          MEMORY_ORDER_SEQ_CST); // the host store always uses seq_cst; only the load below uses MemoryOrder()
+        oldValues[tid] = host_atomic_load<HostAtomicType, HostDataType>(
+            &destMemory[tid], MemoryOrder());
+    }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = (HostDataType)whichDestValue; // the expected value is the index itself, matching the tid stored above
+        return true;
+    }
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount, // checks that every refValues[i] equals i; the verdict is written to 'correct'
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        correct = true;
+        for (cl_uint i = 0; i < threadCount; i++)
+        {
+            if (refValues[i] != (HostDataType)i)
+            {
+                log_error("Invalid value for thread %u\n", (cl_uint)i);
+                correct = false;
+                return true; // stop at the first mismatch; 'correct' is already false
+            }
+        }
+        return true;
+    }
+int test_atomic_load_generic(cl_device_id deviceID, cl_context context, // Runs CBasicTestLoad for the atomic types below; each Execute() result is handed to EXECUTE_TEST together with 'error'.
+                             cl_command_queue queue, int num_elements,
+                             bool useSVM)
+    int error = 0;
+    CBasicTestLoad<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                          useSVM); // NOTE(review): the line that opens this statement (presumably the test_uint declaration) is absent from this excerpt
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                          useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_float.Execute(deviceID, context, queue, num_elements));
+    CBasicTestLoad<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double( // NOTE(review): the constructor-argument line of this declaration is absent from this excerpt
+    EXECUTE_TEST(error,
+                 test_double.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // pointer-sized atomics: use the *_T32 host types when the reported size is 4, the *_T64 ones otherwise
+    {
+        CBasicTestLoad<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestLoad<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestLoad<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestLoad<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error; // whatever value EXECUTE_TEST left in 'error' is the function's result
+int test_atomic_load(cl_device_id deviceID, cl_context context, // Test entry point: runs test_atomic_load_generic with useSVM = false.
+                     cl_command_queue queue, int num_elements)
+    return test_atomic_load_generic(deviceID, context, queue, num_elements,
+                                    false);
+int test_svm_atomic_load(cl_device_id deviceID, cl_context context, // Test entry point: runs test_atomic_load_generic with useSVM = true.
+                         cl_command_queue queue, int num_elements)
+    return test_atomic_load_generic(deviceID, context, queue, num_elements,
+                                    true);
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestExchange // Atomic exchange test: every work-item repeatedly swaps values through the single location destMemory[0].
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr;
+    CBasicTestExchange(TExplicitAtomicType dataType, bool useSVM) // forwards dataType and useSVM unchanged to the base-class constructor
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(123456); // sentinel start value; VerifyRefs handles a value equal to StartValue() as a special case
+    }
+    virtual std::string ProgramCore() // builds the kernel fragment: one exchange that writes tid, then a loop that writes back whatever was last received
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); // the "_explicit" function variant is used only when an order/scope string is present
+        return "  oldValues[tid] = atomic_exchange" + postfix
+            + "(&destMemory[0], tid" + memoryOrderScope
+            + ");\n"
+              "  for(int i = 0; i < "
+            + IterationsStr()
+            + "; i++)\n"
+              "    oldValues[tid] = atomic_exchange"
+            + postfix + "(&destMemory[0], oldValues[tid]" + memoryOrderScope
+            + ");\n";
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount, // host-side counterpart of the kernel fragment built by ProgramCore
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        oldValues[tid] = host_atomic_exchange(&destMemory[0], (HostDataType)tid,
+                                              MemoryOrder());
+        for (int i = 0; i < Iterations(); i++)
+            oldValues[tid] = host_atomic_exchange(
+                &destMemory[0], oldValues[tid], MemoryOrder());
+    }
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount, // checks that refValues plus finalValues[0] contain no repeated or out-of-range value; the verdict is written to 'correct'
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        OldValueCheck(
+            Iterations() % 2
+            == 0); // check is valid for even number of iterations only
+        correct = true;
+        /* We are expecting values from 0 to size-1 and initial value from
+         * atomic variable */
+        /* These values must be distributed across refValues array and atomic
+         * variable finalValues[0] */
+        /* Any repeated value is treated as an error */
+        std::vector<bool> tidFound(threadCount);
+        bool startValueFound = false;
+        cl_uint i;
+        for (i = 0; i <= threadCount; i++) // threadCount + 1 values are examined: every refValues entry plus finalValues[0]
+        {
+            cl_uint value;
+            if (i == threadCount)
+                value = (cl_uint)finalValues[0]; // additional value from atomic
+                                                 // variable (last written)
+            else
+                value = (cl_uint)refValues[i];
+            if (value == (cl_uint)StartValue())
+            {
+                // Special initial value
+                if (startValueFound)
+                {
+                    log_error("ERROR: Starting reference value (%u) occurred "
+                              "more thane once\n",
+                              (cl_uint)StartValue());
+                    correct = false;
+                    return true;
+                }
+                startValueFound = true;
+                continue;
+            }
+            if (value >= threadCount) // anything other than the start value must be a valid thread id
+            {
+                log_error(
+                    "ERROR: Reference value %u outside of valid range! (%u)\n",
+                    i, value);
+                correct = false;
+                return true;
+            }
+            if (tidFound[value])
+            {
+                log_error("ERROR: Value (%u) occurred more thane once\n",
+                          value);
+                correct = false;
+                return true;
+            }
+            tidFound[value] = true;
+        }
+        return true;
+    }
+int test_atomic_exchange_generic(cl_device_id deviceID, cl_context context, // Runs CBasicTestExchange for the atomic types below; each Execute() result is handed to EXECUTE_TEST together with 'error'.
+                                 cl_command_queue queue, int num_elements,
+                                 bool useSVM)
+    int error = 0;
+    CBasicTestExchange<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestExchange<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestExchange<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestExchange<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    CBasicTestExchange<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(
+        TYPE_ATOMIC_FLOAT, useSVM);
+    EXECUTE_TEST(error,
+                 test_float.Execute(deviceID, context, queue, num_elements));
+    CBasicTestExchange<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double( // NOTE(review): the constructor-argument line of this declaration is absent from this excerpt
+    EXECUTE_TEST(error,
+                 test_double.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // pointer-sized atomics: use the *_T32 host types when the reported size is 4, the *_T64 ones otherwise
+    {
+        CBasicTestExchange<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestExchange<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestExchange<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestExchange<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestExchange<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestExchange<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error; // whatever value EXECUTE_TEST left in 'error' is the function's result
+int test_atomic_exchange(cl_device_id deviceID, cl_context context, // Test entry point: runs test_atomic_exchange_generic with useSVM = false.
+                         cl_command_queue queue, int num_elements)
+    return test_atomic_exchange_generic(deviceID, context, queue, num_elements,
+                                        false);
+int test_svm_atomic_exchange(cl_device_id deviceID, cl_context context, // Test entry point: runs test_atomic_exchange_generic with useSVM = true.
+                             cl_command_queue queue, int num_elements)
+    return test_atomic_exchange_generic(deviceID, context, queue, num_elements,
+                                        true);
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestCompareStrong // Test for atomic_compare_exchange_strong: work-items pass values through destMemory[0] via compare-exchange.
+    : public CBasicTestMemOrder2Scope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::OldValueCheck;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder2;
+    using CBasicTestMemOrder2Scope<HostAtomicType,
+                                   HostDataType>::MemoryOrderScope;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::Iterations;
+    using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::IterationsStr;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestCompareStrong(TExplicitAtomicType dataType, bool useSVM) // forwards dataType and useSVM unchanged to the base-class constructor
+        : CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>(dataType,
+                                                                 useSVM)
+    {
+        StartValue(123456); // sentinel start value; VerifyRefs handles a value equal to StartValue() as a special case
+        OldValueCheck(false); // NOTE(review): presumably turns off the base-class old-value verification - confirm
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, // filters out order combinations that are not run, then defers to the base class; MemoryOrder2() is used as the 'failure' order
+                                  cl_command_queue queue)
+    {
+        if (MemoryOrder2() == MEMORY_ORDER_RELEASE
+            || MemoryOrder2() == MEMORY_ORDER_ACQ_REL)
+            return 0; // not allowed as 'failure' argument
+        if ((MemoryOrder() == MEMORY_ORDER_RELAXED
+             && MemoryOrder2() != MEMORY_ORDER_RELAXED)
+            || (MemoryOrder() != MEMORY_ORDER_SEQ_CST
+                && MemoryOrder2() == MEMORY_ORDER_SEQ_CST))
+            return 0; // failure argument shall be no stronger than the success
+        if (CheckCapabilities(MemoryScope(), MemoryOrder())
+            == TEST_SKIPPED_ITSELF)
+            return 0; // skip test - not applicable
+        if (CheckCapabilities(MemoryScope(), MemoryOrder2())
+            == TEST_SKIPPED_ITSELF)
+            return 0; // skip test - not applicable
+        return CBasicTestMemOrder2Scope<
+            HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context,
+                                                             queue);
+    }
+    virtual std::string ProgramCore() // builds the kernel fragment: one compare-exchange expected to fail, then a retry loop that flags spurious failures
+    {
+        std::string memoryOrderScope = MemoryOrderScope();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); // the "_explicit" function variant is used only when an order/scope string is present
+        return std::string("  ") + DataType().RegularTypeName()
+            + " expected, previous;\n"
+              "  int successCount = 0;\n"
+              "  oldValues[tid] = tid;\n"
+              "  expected = tid;  // force failure at the beginning\n"
+              "  if(atomic_compare_exchange_strong"
+            + postfix + "(&destMemory[0], &expected, oldValues[tid]"
+            + memoryOrderScope
+            + ") || expected == tid)\n"
+              "    oldValues[tid] = threadCount+1; //mark unexpected success "
+              "with invalid value\n"
+              "  else\n"
+              "  {\n"
+              "    for(int i = 0; i < "
+            + IterationsStr()
+            + " || successCount == 0; i++)\n" // the kernel loop keeps going past the iteration count until at least one exchange has succeeded
+              "    {\n"
+              "      previous = expected;\n"
+              "      if(atomic_compare_exchange_strong"
+            + postfix + "(&destMemory[0], &expected, oldValues[tid]"
+            + memoryOrderScope
+            + "))\n"
+              "      {\n"
+              "        oldValues[tid] = expected;\n"
+              "        successCount++;\n"
+              "      }\n"
+              "      else\n"
+              "      {\n"
+              "        if(previous == expected) // spurious failure - "
+              "shouldn't occur for 'strong'\n"
+              "        {\n"
+              "          oldValues[tid] = threadCount; //mark fail with "
+              "invalid value\n"
+              "          break;\n"
+              "        }\n"
+              "      }\n"
+              "    }\n"
+              "  }\n";
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount, // host-side compare-exchange loop
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        HostDataType expected = (HostDataType)StartValue(), previous; // unlike the kernel, 'expected' starts at StartValue() and there is no forced initial failure
+        oldValues[tid] = (HostDataType)tid;
+        for (int i = 0; i < Iterations(); i++) // runs exactly Iterations() times; there is no successCount condition on the host side
+        {
+            previous = expected;
+            if (host_atomic_compare_exchange(&destMemory[0], &expected,
+                                             oldValues[tid], MemoryOrder(),
+                                             MemoryOrder2()))
+                oldValues[tid] = expected;
+            else
+            {
+                if (previous == expected) // shouldn't occur for 'strong'
+                {
+                    oldValues[tid] = threadCount; // mark fail with invalid
+                                                  // value
+                }
+            }
+        }
+    }
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount, // checks that refValues plus finalValues[0] contain no repeated or out-of-range value; the verdict is written to 'correct'
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        correct = true;
+        /* We are expecting values from 0 to size-1 and initial value from
+         * atomic variable */
+        /* These values must be distributed across refValues array and atomic
+         * variable finalValues[0] */
+        /* Any repeated value is treated as an error */
+        std::vector<bool> tidFound(threadCount);
+        bool startValueFound = false;
+        cl_uint i;
+        for (i = 0; i <= threadCount; i++) // threadCount + 1 values are examined: every refValues entry plus finalValues[0]
+        {
+            cl_uint value;
+            if (i == threadCount)
+                value = (cl_uint)finalValues[0]; // additional value from atomic
+                                                 // variable (last written)
+            else
+                value = (cl_uint)refValues[i];
+            if (value == (cl_uint)StartValue())
+            {
+                // Special initial value
+                if (startValueFound)
+                {
+                    log_error("ERROR: Starting reference value (%u) occurred "
+                              "more thane once\n",
+                              (cl_uint)StartValue());
+                    correct = false;
+                    return true;
+                }
+                startValueFound = true;
+                continue;
+            }
+            if (value >= threadCount)
+            {
+                if (value == threadCount) // threadCount is the marker the test code writes on a spurious failure
+                    log_error("ERROR: Spurious failure detected for "
+                              "atomic_compare_exchange_strong\n");
+                log_error(
+                    "ERROR: Reference value %u outside of valid range! (%u)\n",
+                    i, value);
+                correct = false;
+                return true;
+            }
+            if (tidFound[value])
+            {
+                log_error("ERROR: Value (%u) occurred more thane once\n",
+                          value);
+                correct = false;
+                return true;
+            }
+            tidFound[value] = true;
+        }
+        return true;
+    }
+int test_atomic_compare_exchange_strong_generic(cl_device_id deviceID, // Runs CBasicTestCompareStrong for the integer and pointer-sized atomic types below; each Execute() result is handed to EXECUTE_TEST together with 'error'.
+                                                cl_context context,
+                                                cl_command_queue queue,
+                                                int num_elements, bool useSVM)
+    int error = 0;
+    CBasicTestCompareStrong<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                                useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestCompareStrong<HOST_ATOMIC_UINT, HOST_UINT> test_uint(
+        TYPE_ATOMIC_UINT, useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestCompareStrong<HOST_ATOMIC_LONG, HOST_LONG> test_long(
+        TYPE_ATOMIC_LONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestCompareStrong<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements)); // no float or double instantiation appears in this function as shown
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // pointer-sized atomics: use the *_T32 host types when the reported size is 4, the *_T64 ones otherwise
+    {
+        CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32>
+            test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64>
+            test_size_t(TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error; // whatever value EXECUTE_TEST left in 'error' is the function's result
+int test_atomic_compare_exchange_strong(cl_device_id deviceID, // Test entry point: runs test_atomic_compare_exchange_strong_generic with useSVM = false.
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
+    return test_atomic_compare_exchange_strong_generic(deviceID, context, queue,
+                                                       num_elements, false);
+int test_svm_atomic_compare_exchange_strong(cl_device_id deviceID, // Test entry point: runs test_atomic_compare_exchange_strong_generic with useSVM = true.
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements)
+    return test_atomic_compare_exchange_strong_generic(deviceID, context, queue,
+                                                       num_elements, true);
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestCompareWeak // Test for atomic_compare_exchange_weak; as shown here it overrides only ProgramCore and inherits the rest from CBasicTestCompareStrong.
+    : public CBasicTestCompareStrong<HostAtomicType, HostDataType> {
+    using CBasicTestCompareStrong<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestCompareStrong<HostAtomicType,
+                                  HostDataType>::MemoryOrderScope;
+    using CBasicTestCompareStrong<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestCompareStrong<HostAtomicType, HostDataType>::Iterations;
+    using CBasicTestCompareStrong<HostAtomicType, HostDataType>::IterationsStr;
+    CBasicTestCompareWeak(TExplicitAtomicType dataType, bool useSVM) // forwards dataType and useSVM unchanged to the CBasicTestCompareStrong constructor
+        : CBasicTestCompareStrong<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {}
+    virtual std::string ProgramCore() // builds the kernel fragment: one compare-exchange expected to fail, then a retry loop with no failure branch
+    {
+        std::string memoryOrderScope = MemoryOrderScope();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); // the "_explicit" function variant is used only when an order/scope string is present
+        return std::string("  ") + DataType().RegularTypeName()
+            + " expected , previous;\n"
+              "  int successCount = 0;\n"
+              "  oldValues[tid] = tid;\n"
+              "  expected = tid;  // force failure at the beginning\n"
+              "  if(atomic_compare_exchange_weak"
+            + postfix + "(&destMemory[0], &expected, oldValues[tid]"
+            + memoryOrderScope
+            + ") || expected == tid)\n"
+              "    oldValues[tid] = threadCount+1; //mark unexpected success "
+              "with invalid value\n"
+              "  else\n"
+              "  {\n"
+              "    for(int i = 0; i < "
+            + IterationsStr()
+            + " || successCount == 0; i++)\n" // the kernel loop keeps going past the iteration count until at least one exchange has succeeded
+              "    {\n"
+              "      previous = expected;\n" // 'previous' is assigned but never read in this kernel: a failed exchange is simply retried
+              "      if(atomic_compare_exchange_weak"
+            + postfix + "(&destMemory[0], &expected, oldValues[tid]"
+            + memoryOrderScope
+            + "))\n"
+              "      {\n"
+              "        oldValues[tid] = expected;\n"
+              "        successCount++;\n"
+              "      }\n"
+              "    }\n"
+              "  }\n";
+    }
+// Runs the weak compare-exchange test case for every atomic integer type:
+// int, uint, long, ulong, and then intptr_t, uintptr_t, size_t and ptrdiff_t.
+// For the pointer-sized types the 32-bit host types are used when the device
+// reports a 4-byte atomic size_t, and the 64-bit host types otherwise.
+//
+// useSVM is forwarded to every test object. The result of each Execute call
+// is handed to EXECUTE_TEST together with `error` (the macro is defined
+// outside this view); the final value of `error` is returned.
+int test_atomic_compare_exchange_weak_generic(cl_device_id deviceID,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements, bool useSVM)
+{
+    int error = 0;
+    CBasicTestCompareWeak<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestCompareWeak<HOST_ATOMIC_UINT, HOST_UINT> test_uint(
+        TYPE_ATOMIC_UINT, useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestCompareWeak<HOST_ATOMIC_LONG, HOST_LONG> test_long(
+        TYPE_ATOMIC_LONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestCompareWeak<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    // Pointer-sized types: choose host types matching the device's width.
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+}
+// Entry point for the weak compare-exchange test with useSVM = false.
+// Returns whatever test_atomic_compare_exchange_weak_generic returns.
+int test_atomic_compare_exchange_weak(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_atomic_compare_exchange_weak_generic(deviceID, context, queue,
+                                                     num_elements, false);
+}
+// Entry point for the weak compare-exchange test with useSVM = true.
+// Returns whatever test_atomic_compare_exchange_weak_generic returns.
+int test_svm_atomic_compare_exchange_weak(cl_device_id deviceID,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements)
+{
+    return test_atomic_compare_exchange_weak_generic(deviceID, context, queue,
+                                                     num_elements, true);
+}
+// Test case for atomic_fetch_add / atomic_fetch_add_explicit.
+//
+// Every work-item adds (tid + 3) to destMemory[0] three times and then adds
+// (tid + 3) shifted left by (sizeof(operand) - 1) * 8 bits, i.e. into the
+// most significant byte of the operand type. Only the value returned by the
+// first addition is stored in oldValues[tid].
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchAdd
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+public:
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    // Both arguments are forwarded unchanged to the base class.
+    CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {}
+    // Returns the OpenCL C kernel body: four atomic_fetch_add calls on
+    // destMemory[0]. The "_explicit" suffix is appended to the function name
+    // only when MemoryOrderScopeStr() is non-empty.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
+        return "  oldValues[tid] = atomic_fetch_add" + postfix
+            + "(&destMemory[0], (" + DataType().AddSubOperandTypeName()
+            + ")tid + 3" + memoryOrderScope + ");\n" + "  atomic_fetch_add"
+            + postfix + "(&destMemory[0], ("
+            + DataType().AddSubOperandTypeName() + ")tid + 3" + memoryOrderScope
+            + ");\n"
+              "  atomic_fetch_add"
+            + postfix + "(&destMemory[0], ("
+            + DataType().AddSubOperandTypeName() + ")tid + 3" + memoryOrderScope
+            + ");\n"
+              "  atomic_fetch_add"
+            + postfix + "(&destMemory[0], (("
+            + DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof("
+            + DataType().AddSubOperandTypeName() + ")-1)*8" + memoryOrderScope
+            + ");\n";
+    }
+    // Performs the same four additions on the host via host_atomic_fetch_add,
+    // using MemoryOrder() for each call. threadCount is not used here.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        oldValues[tid] = host_atomic_fetch_add(
+            &destMemory[0], (HostDataType)tid + 3, MemoryOrder());
+        host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3,
+                              MemoryOrder());
+        host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3,
+                              MemoryOrder());
+        host_atomic_fetch_add(&destMemory[0],
+                              ((HostDataType)tid + 3)
+                                  << (sizeof(HostDataType) - 1) * 8,
+                              MemoryOrder());
+    }
+    // Computes the value destMemory[0] should hold after all threads ran:
+    // StartValue() plus, for each thread index i, three times (i + 3) and one
+    // shifted (i + 3). Always returns true; startRefValues and whichDestValue
+    // are not used here.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = StartValue();
+        for (cl_uint i = 0; i < threadCount; i++)
+            expected += ((HostDataType)i + 3) * 3
+                + (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8);
+        return true;
+    }
+};
+// Runs the fetch_add test case for every atomic integer type: int, uint,
+// long, ulong, and then intptr_t, uintptr_t, size_t and ptrdiff_t. For the
+// pointer-sized types the 32-bit host types are used when the device reports
+// a 4-byte atomic size_t, and the 64-bit host types otherwise.
+//
+// useSVM is forwarded to every test object. The result of each Execute call
+// is handed to EXECUTE_TEST together with `error` (the macro is defined
+// outside this view); the final value of `error` is returned.
+int test_atomic_fetch_add_generic(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements,
+                                  bool useSVM)
+{
+    int error = 0;
+    CBasicTestFetchAdd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchAdd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchAdd<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchAdd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    // Pointer-sized types: choose host types matching the device's width.
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+}
+// Entry point for the fetch_add test with useSVM = false.
+// Returns whatever test_atomic_fetch_add_generic returns.
+int test_atomic_fetch_add(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements,
+                                         false);
+}
+// Entry point for the fetch_add test with useSVM = true.
+// Returns whatever test_atomic_fetch_add_generic returns.
+int test_svm_atomic_fetch_add(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements,
+                                         true);
+}
+// Test case for atomic_fetch_sub / atomic_fetch_sub_explicit.
+//
+// Every work-item performs a single subtraction on destMemory[0] whose
+// operand is (tid + 3) plus (tid + 3) shifted left by
+// (sizeof(operand) - 1) * 8 bits, and stores the returned previous value in
+// oldValues[tid].
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchSub
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+public:
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    // Both arguments are forwarded unchanged to the base class.
+    CBasicTestFetchSub(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {}
+    // Returns the OpenCL C kernel body: one atomic_fetch_sub call on
+    // destMemory[0]. The "_explicit" suffix is appended to the function name
+    // only when MemoryOrderScopeStr() is non-empty.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
+        return "  oldValues[tid] = atomic_fetch_sub" + postfix
+            + "(&destMemory[0], tid + 3 +((("
+            + DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof("
+            + DataType().AddSubOperandTypeName() + ")-1)*8)" + memoryOrderScope
+            + ");\n";
+    }
+    // Performs the same subtraction on the host via host_atomic_fetch_sub,
+    // using MemoryOrder(). threadCount is not used here.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        oldValues[tid] = host_atomic_fetch_sub(
+            &destMemory[0],
+            (HostDataType)tid + 3
+                + (((HostDataType)tid + 3) << (sizeof(HostDataType) - 1) * 8),
+            MemoryOrder());
+    }
+    // Computes the value destMemory[0] should hold after all threads ran:
+    // StartValue() minus every thread's operand. Always returns true;
+    // startRefValues and whichDestValue are not used here.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = StartValue();
+        for (cl_uint i = 0; i < threadCount; i++)
+            expected -= (HostDataType)i + 3
+                + (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8);
+        return true;
+    }
+};
+// Runs the fetch_sub test case for every atomic integer type: int, uint,
+// long, ulong, and then intptr_t, uintptr_t, size_t and ptrdiff_t. For the
+// pointer-sized types the 32-bit host types are used when the device reports
+// a 4-byte atomic size_t, and the 64-bit host types otherwise.
+//
+// useSVM is forwarded to every test object. The result of each Execute call
+// is handed to EXECUTE_TEST together with `error` (the macro is defined
+// outside this view); the final value of `error` is returned.
+int test_atomic_fetch_sub_generic(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements,
+                                  bool useSVM)
+{
+    int error = 0;
+    CBasicTestFetchSub<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchSub<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchSub<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchSub<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    // Pointer-sized types: choose host types matching the device's width.
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchSub<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchSub<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+}
+// Entry point for the fetch_sub test with useSVM = false.
+// Returns whatever test_atomic_fetch_sub_generic returns.
+int test_atomic_fetch_sub(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements,
+                                         false);
+}
+// Entry point for the fetch_sub test with useSVM = true.
+// Returns whatever test_atomic_fetch_sub_generic returns.
+int test_svm_atomic_fetch_sub(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements,
+                                         true);
+}
+// Test case for atomic_fetch_or / atomic_fetch_or_explicit.
+//
+// The start value is 0. Each work-item sets exactly one bit: thread `tid`
+// ORs bit (tid % numBits) into destMemory[tid / numBits], where numBits is
+// the bit width of the data type, and stores the returned previous value in
+// oldValues[tid].
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchOr
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+public:
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    // Forwards both arguments to the base class and sets the start value
+    // to 0, so that every bit begins cleared.
+    CBasicTestFetchOr(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(0);
+    }
+    // Number of destination values: threadCount divided by the type's bit
+    // width on the given device, rounded up (one bit per thread).
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        cl_uint numBits = DataType().Size(deviceID) * 8;
+        return (threadCount + numBits - 1) / numBits;
+    }
+    // Returns the OpenCL C kernel body. The "_explicit" suffix is appended
+    // to the function name only when MemoryOrderScopeStr() is non-empty.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
+        return std::string("    size_t numBits = sizeof(")
+            + DataType().RegularTypeName()
+            + ") * 8;\n"
+              "    int whichResult = tid / numBits;\n"
+              "    int bitIndex = tid - (whichResult * numBits);\n"
+              "\n"
+              "    oldValues[tid] = atomic_fetch_or"
+            + postfix + "(&destMemory[whichResult], (("
+            + DataType().RegularTypeName() + ")1 << bitIndex) "
+            + memoryOrderScope + ");\n";
+    }
+    // Performs the same single-bit OR on the host via host_atomic_fetch_or,
+    // using MemoryOrder(). threadCount is not used here.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        size_t numBits = sizeof(HostDataType) * 8;
+        size_t whichResult = tid / numBits;
+        size_t bitIndex = tid - (whichResult * numBits);
+        oldValues[tid] =
+            host_atomic_fetch_or(&destMemory[whichResult],
+                                 ((HostDataType)1 << bitIndex), MemoryOrder());
+    }
+    // Computes the expected final content of destination value
+    // `whichDestValue`. Every value except the last must have all bits set;
+    // the last one has only its lowest (remaining thread count) bits set on
+    // top of StartValue(). Always returns true; startRefValues is not used.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        cl_uint numValues = (threadCount + (sizeof(HostDataType) * 8 - 1))
+            / (sizeof(HostDataType) * 8);
+        if (whichDestValue < numValues - 1)
+        {
+            expected = ~(HostDataType)0;
+            return true;
+        }
+        // Last item doesn't get or'ed on every bit, so we have to mask away
+        cl_uint numBits =
+            threadCount - whichDestValue * (sizeof(HostDataType) * 8);
+        expected = StartValue();
+        for (cl_uint i = 0; i < numBits; i++)
+            expected |= ((HostDataType)1 << i);
+        return true;
+    }
+};
+// Runs the fetch_or test case for every atomic integer type: int, uint,
+// long, ulong, and then intptr_t, uintptr_t, size_t and ptrdiff_t. For the
+// pointer-sized types the 32-bit host types are used when the device reports
+// a 4-byte atomic size_t, and the 64-bit host types otherwise.
+//
+// useSVM is forwarded to every test object. The result of each Execute call
+// is handed to EXECUTE_TEST together with `error` (the macro is defined
+// outside this view); the final value of `error` is returned.
+int test_atomic_fetch_or_generic(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements,
+                                 bool useSVM)
+{
+    int error = 0;
+    CBasicTestFetchOr<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                          useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchOr<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT,
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchOr<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG,
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchOr<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    // Pointer-sized types: choose host types matching the device's width.
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOr<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOr<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+}
+// Entry point for the fetch_or test with useSVM = false.
+// Returns whatever test_atomic_fetch_or_generic returns.
+int test_atomic_fetch_or(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements,
+                                        false);
+}
+// Entry point for the fetch_or test with useSVM = true.
+// Returns whatever test_atomic_fetch_or_generic returns.
+int test_svm_atomic_fetch_or(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements,
+                                        true);
+}
+// Test case for atomic_fetch_xor / atomic_fetch_xor_explicit.
+//
+// All work-items operate on destMemory[0]. Thread `tid` toggles the bit at
+// index (numBits - 1) * (tid + 1) / threadCount, where numBits is the bit
+// width of the data type, and stores the returned previous value in
+// oldValues[tid].
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchXor
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+public:
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    // Forwards both arguments to the base class and installs a fixed bit
+    // pattern as the start value; the cast to HostDataType narrows the
+    // 64-bit literal for smaller types.
+    CBasicTestFetchXor(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue((HostDataType)0x2f08ab418ba0541LL);
+    }
+    // Returns the OpenCL C kernel body. The "_explicit" suffix is appended
+    // to the function name only when MemoryOrderScopeStr() is non-empty.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
+        return std::string("  int numBits = sizeof(")
+            + DataType().RegularTypeName()
+            + ") * 8;\n"
+              "  int bitIndex = (numBits-1)*(tid+1)/threadCount;\n"
+              "\n"
+              "  oldValues[tid] = atomic_fetch_xor"
+            + postfix + "(&destMemory[0], ((" + DataType().RegularTypeName()
+            + ")1 << bitIndex) " + memoryOrderScope + ");\n";
+    }
+    // Performs the same single-bit XOR on the host via host_atomic_fetch_xor,
+    // using MemoryOrder().
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        int numBits = sizeof(HostDataType) * 8;
+        int bitIndex = (numBits - 1) * (tid + 1) / threadCount;
+        oldValues[tid] = host_atomic_fetch_xor(
+            &destMemory[0], ((HostDataType)1 << bitIndex), MemoryOrder());
+    }
+    // Computes the expected final value by replaying every thread's XOR on
+    // StartValue(). Always returns true; startRefValues and whichDestValue
+    // are not used here.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        int numBits = sizeof(HostDataType) * 8;
+        expected = StartValue();
+        for (cl_uint i = 0; i < threadCount; i++)
+        {
+            int bitIndex = (numBits - 1) * (i + 1) / threadCount;
+            expected ^= ((HostDataType)1 << bitIndex);
+        }
+        return true;
+    }
+};
+// Runs the fetch_xor test case for every atomic integer type: int, uint,
+// long, ulong, and then intptr_t, uintptr_t, size_t and ptrdiff_t. For the
+// pointer-sized types the 32-bit host types are used when the device reports
+// a 4-byte atomic size_t, and the 64-bit host types otherwise.
+//
+// useSVM is forwarded to every test object. The result of each Execute call
+// is handed to EXECUTE_TEST together with `error` (the macro is defined
+// outside this view); the final value of `error` is returned.
+int test_atomic_fetch_xor_generic(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements,
+                                  bool useSVM)
+{
+    int error = 0;
+    CBasicTestFetchXor<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchXor<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchXor<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG,
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchXor<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    // Pointer-sized types: choose host types matching the device's width.
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+}
+// Entry point for the fetch_xor test with useSVM = false.
+// Returns whatever test_atomic_fetch_xor_generic returns.
+int test_atomic_fetch_xor(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements,
+                                         false);
+}
+// Entry point for the fetch_xor test with useSVM = true.
+// Returns whatever test_atomic_fetch_xor_generic returns.
+int test_svm_atomic_fetch_xor(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
+{
+    return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements,
+                                         true);
+}
+// Test case for atomic_fetch_and / atomic_fetch_and_explicit.
+//
+// The start value has every bit set. Each work-item clears exactly one bit:
+// thread `tid` ANDs destMemory[tid / numBits] with the complement of
+// (1 << (tid % numBits)), where numBits is the bit width of the data type,
+// and stores the returned previous value in oldValues[tid].
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchAnd
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+public:
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    // Forwards both arguments to the base class and sets the start value
+    // to all ones, so that every bit begins set.
+    CBasicTestFetchAnd(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(~(HostDataType)0);
+    }
+    // Number of destination values: threadCount divided by the type's bit
+    // width on the given device, rounded up (one bit per thread).
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        cl_uint numBits = DataType().Size(deviceID) * 8;
+        return (threadCount + numBits - 1) / numBits;
+    }
+    // Returns the OpenCL C kernel body. The "_explicit" suffix is appended
+    // to the function name only when MemoryOrderScopeStr() is non-empty.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr();
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
+        return std::string("  size_t numBits = sizeof(")
+            + DataType().RegularTypeName()
+            + ") * 8;\n"
+              "  int whichResult = tid / numBits;\n"
+              "  int bitIndex = tid - (whichResult * numBits);\n"
+              "\n"
+              "  oldValues[tid] = atomic_fetch_and"
+            + postfix + "(&destMemory[whichResult], ~(("
+            + DataType().RegularTypeName() + ")1 << bitIndex) "
+            + memoryOrderScope + ");\n";
+    }
+    // Performs the same single-bit AND on the host via host_atomic_fetch_and,
+    // using MemoryOrder(). threadCount is not used here.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        size_t numBits = sizeof(HostDataType) * 8;
+        size_t whichResult = tid / numBits;
+        size_t bitIndex = tid - (whichResult * numBits);
+        oldValues[tid] = host_atomic_fetch_and(&destMemory[whichResult],
+                                               ~((HostDataType)1 << bitIndex),
+                                               MemoryOrder());
+    }
+    // Computes the expected final content of destination value
+    // `whichDestValue`. Every value except the last must be 0; the last one
+    // has only its lowest (remaining thread count) bits cleared from
+    // StartValue(). Always returns true; startRefValues is not used.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        cl_uint numValues = (threadCount + (sizeof(HostDataType) * 8 - 1))
+            / (sizeof(HostDataType) * 8);
+        if (whichDestValue < numValues - 1)
+        {
+            expected = 0;
+            return true;
+        }
+        // Last item doesn't get and'ed on every bit, so we have to mask away
+        size_t numBits =
+            threadCount - whichDestValue * (sizeof(HostDataType) * 8);
+        expected = StartValue();
+        for (size_t i = 0; i < numBits; i++)
+            expected &= ~((HostDataType)1 << i);
+        return true;
+    }
+};
+int test_atomic_fetch_and_generic(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements,
+                                  bool useSVM) // runs CBasicTestFetchAnd once per atomic integer type
+    int error = 0; // handed to every EXECUTE_TEST below and returned at the end
+    CBasicTestFetchAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM); // NOTE(review): the head of the test_uint declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM); // NOTE(review): the head of the test_long declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchAnd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // 4-byte device size_t: use the 32-bit host types
+    {
+        CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else // any other size: use the 64-bit host types
+    {
+        CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+int test_atomic_fetch_and(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements,
+                                         false); // useSVM = false
+int test_svm_atomic_fetch_and(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements,
+                                         true); // useSVM = true
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchOrAnd // each thread sets then clears its own bit with fetch_or / fetch_and
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; // base members are pulled in so they can be used unqualified
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr;
+    CBasicTestFetchOrAnd(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(0); // start value 0: presumably every destination word begins with all bits clear
+    }
+    // Each thread owns a single bit, so one destination word is needed per
+    // group of threads that fits into the bits of the device data type.
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        const auto bitsPerWord = DataType().Size(deviceID) * 8;
+        const auto lastThread = threadCount - 1;
+        return 1 + lastThread / bitsPerWord;
+    }
+    // Builds the kernel body: each work-item sets and then clears (fetch_or,
+    // fetch_and) exactly one bit of an atomic variable and checks that bit in
+    // the fetched values. Other bits are modified concurrently by other
+    // work-items, which must not affect the current work-item's bit.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr(); // extra order/scope arguments, possibly empty
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); // the _explicit builtins are used only when such arguments exist
+        return std::string("  int bits = sizeof(")
+            + DataType().RegularTypeName() + ")*8;\n"
+            + "  size_t valueInd = tid/bits;\n"
+              "  "
+            + DataType().RegularTypeName() + " value, bitMask = ("
+            + DataType().RegularTypeName()
+            + ")1 << tid%bits;\n"
+              "  oldValues[tid] = 0;\n"
+              "  for(int i = 0; i < "
+            + IterationsStr()
+            + "; i++)\n"
+              "  {\n"
+              "    value = atomic_fetch_or"
+            + postfix + "(destMemory+valueInd, bitMask" + memoryOrderScope
+            + ");\n"
+              "    if(value & bitMask) // bit should be set to 0\n"
+              "      oldValues[tid]++;\n"
+              "    value = atomic_fetch_and"
+            + postfix + "(destMemory+valueInd, ~bitMask" + memoryOrderScope
+            + ");\n"
+              "    if(!(value & bitMask)) // bit should be set to 1\n"
+              "      oldValues[tid]++;\n"
+              "  }\n";
+    }
+    // Host-side counterpart of the kernel body: thread `tid` repeatedly sets
+    // and clears its own bit and counts, in oldValues[tid], every fetched
+    // value in which that bit was not in the state the thread left it in.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        const int bitsPerWord = sizeof(HostDataType) * 8;
+        volatile HostAtomicType *const word = destMemory + tid / bitsPerWord;
+        const HostDataType ownBit = (HostDataType)1 << tid % bitsPerWord;
+        HostDataType &mismatches = oldValues[tid];
+        mismatches = 0;
+        for (int pass = 0; pass < Iterations(); pass++)
+        {
+            // Set the bit; the fetched value must show it still clear.
+            const HostDataType beforeSet =
+                host_atomic_fetch_or(word, ownBit, MemoryOrder());
+            if (beforeSet & ownBit) mismatches++;
+
+            // Clear the bit; the fetched value must show it set.
+            const HostDataType beforeClear =
+                host_atomic_fetch_and(word, ~ownBit, MemoryOrder());
+            if (!(beforeClear & ownBit)) mismatches++;
+        }
+    }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = 0; // each thread's last operation clears its own bit, so every word must end at 0
+        return true; // returned unconditionally
+    }
+    // Checks the per-thread mismatch counters: `correct` becomes false (and
+    // the thread is logged) if any counter is above zero. Always returns true.
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        bool allClean = true;
+        for (cl_uint thread = 0; thread != threadCount; ++thread)
+        {
+            if (!(refValues[thread] > 0)) continue;
+            log_error("Thread %d found %d mismatch(es)\n", thread,
+                      (cl_uint)refValues[thread]);
+            allClean = false;
+        }
+        correct = allClean;
+        return true;
+    }
+int test_atomic_fetch_orand_generic(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue, int num_elements,
+                                    bool useSVM) // runs CBasicTestFetchOrAnd once per atomic integer type
+    int error = 0; // handed to every EXECUTE_TEST below and returned at the end
+    CBasicTestFetchOrAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                             useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchOrAnd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(
+        TYPE_ATOMIC_UINT, useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchOrAnd<HOST_ATOMIC_LONG, HOST_LONG> test_long(
+        TYPE_ATOMIC_LONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchOrAnd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // 4-byte device size_t: use the 32-bit host types
+    {
+        CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else // any other size: use the 64-bit host types
+    {
+        CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+int test_atomic_fetch_orand(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_orand_generic(deviceID, context, queue,
+                                           num_elements, false); // useSVM = false
+int test_svm_atomic_fetch_orand(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_orand_generic(deviceID, context, queue,
+                                           num_elements, true); // useSVM = true
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchXor2 // each thread toggles its own bit twice per iteration with fetch_xor
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; // base members are pulled in so they can be used unqualified
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr;
+    CBasicTestFetchXor2(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(0); // start value 0: presumably every destination word begins with all bits clear
+    }
+    // Each thread owns a single bit, so one destination word is needed per
+    // group of threads that fits into the bits of the device data type.
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        const auto bitsPerWord = DataType().Size(deviceID) * 8;
+        const auto lastThread = threadCount - 1;
+        return 1 + lastThread / bitsPerWord;
+    }
+    // Builds the kernel body: each work-item toggles (fetch_xor, twice per
+    // iteration) exactly one bit of an atomic variable and checks that bit in
+    // the fetched values. Other bits are modified concurrently by other
+    // work-items, which must not affect the current work-item's bit.
+    virtual std::string ProgramCore()
+    {
+        std::string memoryOrderScope = MemoryOrderScopeStr(); // extra order/scope arguments, possibly empty
+        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); // the _explicit builtin is used only when such arguments exist
+        return std::string("  int bits = sizeof(")
+            + DataType().RegularTypeName() + ")*8;\n"
+            + "  size_t valueInd = tid/bits;\n"
+              "  "
+            + DataType().RegularTypeName() + " value, bitMask = ("
+            + DataType().RegularTypeName()
+            + ")1 << tid%bits;\n"
+              "  oldValues[tid] = 0;\n"
+              "  for(int i = 0; i < "
+            + IterationsStr()
+            + "; i++)\n"
+              "  {\n"
+              "    value = atomic_fetch_xor"
+            + postfix + "(destMemory+valueInd, bitMask" + memoryOrderScope
+            + ");\n"
+              "    if(value & bitMask) // bit should be set to 0\n"
+              "      oldValues[tid]++;\n"
+              "    value = atomic_fetch_xor"
+            + postfix + "(destMemory+valueInd, bitMask" + memoryOrderScope
+            + ");\n"
+              "    if(!(value & bitMask)) // bit should be set to 1\n"
+              "      oldValues[tid]++;\n"
+              "  }\n";
+    }
+    // Host-side counterpart of the kernel body: thread `tid` toggles its own
+    // bit twice per iteration and counts, in oldValues[tid], every fetched
+    // value in which that bit was not in the state the thread left it in.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        const int bitsPerWord = sizeof(HostDataType) * 8;
+        volatile HostAtomicType *const word = destMemory + tid / bitsPerWord;
+        const HostDataType ownBit = (HostDataType)1 << tid % bitsPerWord;
+        HostDataType &mismatches = oldValues[tid];
+        mismatches = 0;
+        for (int pass = 0; pass < Iterations(); pass++)
+        {
+            // First toggle sets the bit; the fetched value must show it clear.
+            const HostDataType beforeSet =
+                host_atomic_fetch_xor(word, ownBit, MemoryOrder());
+            if (beforeSet & ownBit) mismatches++;
+
+            // Second toggle clears it; the fetched value must show it set.
+            const HostDataType beforeClear =
+                host_atomic_fetch_xor(word, ownBit, MemoryOrder());
+            if (!(beforeClear & ownBit)) mismatches++;
+        }
+    }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = 0; // each thread toggles its bit an even number of times, so every word must end at 0
+        return true; // returned unconditionally
+    }
+    // Checks the per-thread mismatch counters: `correct` becomes false (and
+    // the thread is logged) if any counter is above zero. Always returns true.
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        bool allClean = true;
+        for (cl_uint thread = 0; thread != threadCount; ++thread)
+        {
+            if (!(refValues[thread] > 0)) continue;
+            log_error("Thread %d found %d mismatches\n", thread,
+                      (cl_uint)refValues[thread]);
+            allClean = false;
+        }
+        correct = allClean;
+        return true;
+    }
+int test_atomic_fetch_xor2_generic(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements,
+                                   bool useSVM) // runs CBasicTestFetchXor2 once per atomic integer type
+    int error = 0; // handed to every EXECUTE_TEST below and returned at the end
+    CBasicTestFetchXor2<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                            useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                               useSVM); // NOTE(review): the head of the test_uint declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                               useSVM); // NOTE(review): the head of the test_long declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchXor2<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // 4-byte device size_t: use the 32-bit host types
+    {
+        CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else // any other size: use the 64-bit host types
+    {
+        CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+int test_atomic_fetch_xor2(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_xor2_generic(deviceID, context, queue,
+                                          num_elements, false); // useSVM = false
+int test_svm_atomic_fetch_xor2(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_xor2_generic(deviceID, context, queue,
+                                          num_elements, true); // useSVM = true
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchMin // every thread applies fetch_min with its own operand to destMemory[0]
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; // base members are pulled in so they can be used unqualified
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    CBasicTestFetchMin(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(DataType().MaxValue()); // start at the type's MaxValue() so any operand can only lower it
+    }
+    // Emits the kernel statement: each work-item applies atomic_fetch_min
+    // (the _explicit variant when an order/scope suffix is present) to
+    // destMemory[0] with its own oldValues[tid] operand and stores the
+    // fetched value back into oldValues[tid].
+    virtual std::string ProgramCore()
+    {
+        const std::string orderScopeArgs = MemoryOrderScopeStr();
+        std::string statement = "  oldValues[tid] = atomic_fetch_min";
+        if (!orderScopeArgs.empty()) statement += "_explicit";
+        statement += "(&destMemory[0], oldValues[tid] ";
+        statement += orderScopeArgs;
+        statement += ");\n";
+        return statement;
+    }
+    // Host-side counterpart of the kernel statement: applies fetch_min to the
+    // first destination word using this thread's operand, then replaces the
+    // operand with the value that was fetched.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        HostDataType &slot = oldValues[tid];
+        slot = host_atomic_fetch_min(destMemory, slot, MemoryOrder());
+    }
+    // Fills startRefValues with one pseudo-random operand per thread, drawn
+    // from the generator state `d`. Always returns true.
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
+    {
+        for (cl_uint thread = 0; thread < threadCount; ++thread)
+        {
+            HostDataType operand = genrand_int32(d);
+            // Types of 8 bytes or more get a second draw OR-ed in, shifted
+            // left by 16 bits.
+            // NOTE(review): assuming genrand_int32 yields 32 bits, a 16-bit
+            // shift never sets the top 16 bits of a 64-bit operand - confirm
+            // whether a 32-bit shift was intended.
+            if (sizeof(HostDataType) >= 8)
+                operand |= (HostDataType)genrand_int32(d) << 16;
+            startRefValues[thread] = operand;
+        }
+        return true;
+    }
+    // The expected final value is the smallest of the start value and all
+    // per-thread operands. Always returns true.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = StartValue();
+        const HostDataType *const end = startRefValues + threadCount;
+        for (const HostDataType *operand = startRefValues; operand != end;
+             ++operand)
+        {
+            if (*operand < expected) expected = *operand;
+        }
+        return true;
+    }
+int test_atomic_fetch_min_generic(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements,
+                                  bool useSVM) // runs CBasicTestFetchMin once per atomic integer type
+    int error = 0; // handed to every EXECUTE_TEST below and returned at the end
+    CBasicTestFetchMin<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM); // NOTE(review): the head of the test_uint declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM); // NOTE(review): the head of the test_long declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchMin<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // 4-byte device size_t: use the 32-bit host types
+    {
+        CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMin<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else // any other size: use the 64-bit host types
+    {
+        CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMin<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+int test_atomic_fetch_min(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements,
+                                         false); // useSVM = false
+int test_svm_atomic_fetch_min(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements,
+                                         true); // useSVM = true
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFetchMax // every thread applies fetch_max with its own operand to destMemory[0]
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; // base members are pulled in so they can be used unqualified
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    CBasicTestFetchMax(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(DataType().MinValue()); // start at the type's MinValue() so any operand can only raise it
+    }
+    // Emits the kernel statement: each work-item applies atomic_fetch_max
+    // (the _explicit variant when an order/scope suffix is present) to
+    // destMemory[0] with its own oldValues[tid] operand and stores the
+    // fetched value back into oldValues[tid].
+    virtual std::string ProgramCore()
+    {
+        const std::string orderScopeArgs = MemoryOrderScopeStr();
+        std::string statement = "  oldValues[tid] = atomic_fetch_max";
+        if (!orderScopeArgs.empty()) statement += "_explicit";
+        statement += "(&destMemory[0], oldValues[tid] ";
+        statement += orderScopeArgs;
+        statement += ");\n";
+        return statement;
+    }
+    // Host-side counterpart of the kernel statement: applies fetch_max to the
+    // first destination word using this thread's operand, then replaces the
+    // operand with the value that was fetched.
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        HostDataType &slot = oldValues[tid];
+        slot = host_atomic_fetch_max(destMemory, slot, MemoryOrder());
+    }
+    // Fills startRefValues with one pseudo-random operand per thread, drawn
+    // from the generator state `d`. Always returns true.
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
+    {
+        for (cl_uint thread = 0; thread < threadCount; ++thread)
+        {
+            HostDataType operand = genrand_int32(d);
+            // Types of 8 bytes or more get a second draw OR-ed in, shifted
+            // left by 16 bits.
+            // NOTE(review): assuming genrand_int32 yields 32 bits, a 16-bit
+            // shift never sets the top 16 bits of a 64-bit operand - confirm
+            // whether a 32-bit shift was intended.
+            if (sizeof(HostDataType) >= 8)
+                operand |= (HostDataType)genrand_int32(d) << 16;
+            startRefValues[thread] = operand;
+        }
+        return true;
+    }
+    // The expected final value is the largest of the start value and all
+    // per-thread operands. Always returns true.
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = StartValue();
+        const HostDataType *const end = startRefValues + threadCount;
+        for (const HostDataType *operand = startRefValues; operand != end;
+             ++operand)
+        {
+            if (*operand > expected) expected = *operand;
+        }
+        return true;
+    }
+int test_atomic_fetch_max_generic(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements,
+                                  bool useSVM) // runs CBasicTestFetchMax once per atomic integer type
+    int error = 0; // handed to every EXECUTE_TEST below and returned at the end
+    CBasicTestFetchMax<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT,
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM); // NOTE(review): the head of the test_uint declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM); // NOTE(review): the head of the test_long declaration is not visible in this excerpt
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+    CBasicTestFetchMax<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(
+        TYPE_ATOMIC_ULONG, useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) // 4-byte device size_t: use the 32-bit host types
+    {
+        CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMax<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else // any other size: use the 64-bit host types
+    {
+        CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64>
+            test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64>
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMax<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64>
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+int test_atomic_fetch_max(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements,
+                                         false); // useSVM = false
+int test_svm_atomic_fetch_max(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
+    return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements,
+                                         true); // useSVM = true
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFlag // atomic_flag test built on CBasicTestMemOrderScope
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    static const HostDataType CRITICAL_SECTION_NOT_VISITED = 1000000000; // NOTE(review): presumably a sentinel for "thread never entered the critical section"; its uses are outside this excerpt
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; // base members are pulled in so they can be used unqualified
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::MemoryOrderScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
+    CBasicTestFlag(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        StartValue(0); // start value 0
+        OldValueCheck(false); // presumably disables the base class's old-value verification - TODO confirm
+    }
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        return threadCount; // one result per thread; deviceID is not consulted
+    }
+    // Translate the memory order under test into one that is legal for
+    // atomic_flag_clear, which "shall not be memory_order_acquire nor
+    // memory_order_acq_rel": acquire becomes relaxed, acq_rel becomes
+    // release, and every other order is passed through unchanged.
+    TExplicitMemoryOrderType MemoryOrderForClear()
+    {
+        switch (MemoryOrder())
+        {
+            case MEMORY_ORDER_ACQUIRE: return MEMORY_ORDER_RELAXED;
+            case MEMORY_ORDER_ACQ_REL: return MEMORY_ORDER_RELEASE;
+            default: return MemoryOrder();
+        }
+    }
+    // Argument suffix for the generated atomic_flag_clear call: the scope
+    // string, preceded by ", <order>" (using the clear-safe order from
+    // MemoryOrderForClear) unless the order under test is empty.
+    std::string MemoryOrderScopeStrForClear()
+    {
+        if (MemoryOrder() == MEMORY_ORDER_EMPTY) return MemoryScopeStr();
+
+        const std::string orderArg = std::string(", ")
+            + get_memory_order_type_name(MemoryOrderForClear());
+        return orderArg + MemoryScopeStr();
+    }
+    // Run one flag test configuration, skipping it (return 0) when it needs
+    // a fence scope the device does not report.
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        typedef CBasicTestMemOrderScope<HostAtomicType, HostDataType> Base;
+
+        // This test assumes support for the memory_scope_device scope in the
+        // case that LocalMemory() == false. Supporting that scope is optional
+        // on a 3.0 driver, so the configuration is skipped when the device
+        // does not advertise it.
+        const bool deviceScopeUnavailable =
+            get_device_cl_version(deviceID) >= Version{ 3, 0 }
+            && !LocalMemory()
+            && !(gAtomicFenceCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE);
+        if (deviceScopeUnavailable)
+        {
+            log_info("Skipping atomic_flag test due to use of "
+                     "atomic_scope_device "
+                     "which is optionally not supported on this device\n");
+            return 0; // skip test - not applicable
+        }
+        return Base::ExecuteSingleTest(deviceID, context, queue);
+    }
+    // Generate the OpenCL C kernel body for the flag test.
+    //
+    // The emitted code walks the flags in destMemory; whenever
+    // atomic_flag_test_and_set reports the flag was clear, the thread checks
+    // whether the matching oldValues slot still holds the "not visited"
+    // sentinel, claims it with its tid if so, and then clears the flag.
+    // Extra atomic_work_item_fence calls are emitted when the order under
+    // test is relaxed/release (acquire fence) or acquire/relaxed (release
+    // fence), and always for local memory.
+    virtual std::string ProgramCore()
+    {
+        const std::string orderScopeArgs = MemoryOrderScopeStr();
+        const std::string suffix(orderScopeArgs.empty() ? "" : "_explicit");
+        const bool inLocal = LocalMemory();
+
+        // Arguments shared by both atomic_work_item_fence calls below.
+        const std::string fenceFlags(
+            inLocal ? "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, "
+                    : "CLK_GLOBAL_MEM_FENCE, ");
+        const std::string fenceScope(
+            inLocal ? "memory_scope_work_group"
+                    : (UseSVM() ? "memory_scope_all_svm_devices"
+                                : "memory_scope_device"));
+
+        const bool needAcquireFence = MemoryOrder() == MEMORY_ORDER_RELAXED
+            || MemoryOrder() == MEMORY_ORDER_RELEASE || inLocal;
+        const bool needReleaseFence = MemoryOrder() == MEMORY_ORDER_ACQUIRE
+            || MemoryOrder() == MEMORY_ORDER_RELAXED || inLocal;
+
+        // The sentinel is spliced into the kernel text as a literal.
+        std::ostringstream sentinelText;
+        sentinelText << CRITICAL_SECTION_NOT_VISITED;
+
+        std::string program;
+        program += "  uint cnt, stop = 0;\n";
+        program += "  for(cnt = 0; !stop && cnt < threadCount; cnt++) // each "
+                   "thread must find critical section where it is the first "
+                   "visitor\n";
+        program += "  {\n";
+        program += "    bool set = atomic_flag_test_and_set" + suffix
+            + "(&destMemory[cnt]" + orderScopeArgs + ");\n";
+        if (needAcquireFence)
+        {
+            program += "    atomic_work_item_fence(" + fenceFlags
+                + "memory_order_acquire," + fenceScope + ");\n";
+        }
+        program += "    if (!set)\n";
+        program += "    {\n";
+        program += inLocal
+            ? "      uint csIndex = "
+              "get_enqueued_local_size(0)*get_group_id(0)+cnt;\n"
+            : "      uint csIndex = cnt;\n";
+        program += "      // verify that thread is the first visitor\n";
+        program +=
+            "      if(oldValues[csIndex] == " + sentinelText.str() + ")\n";
+        program += "      {\n";
+        program += "        oldValues[csIndex] = tid; // set the winner id for "
+                   "this critical section\n";
+        program += "        stop = 1;\n";
+        program += "      }\n";
+        if (needReleaseFence)
+        {
+            program += "      atomic_work_item_fence(" + fenceFlags
+                + "memory_order_release," + fenceScope + ");\n";
+        }
+        program += "      atomic_flag_clear" + suffix + "(&destMemory[cnt]"
+            + MemoryOrderScopeStrForClear() + ");\n";
+        program += "    }\n";
+        program += "  }\n";
+        return program;
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        cl_uint cnt, stop = 0;
+        for (cnt = 0; !stop && cnt < threadCount;
+             cnt++) // each thread must find critical section where it is the
+                    // first visitor
+        {
+            if (!host_atomic_flag_test_and_set(&destMemory[cnt], MemoryOrder()))
+            {
+                cl_uint csIndex = cnt;
+                // verify that thread is the first visitor
+                if (oldValues[csIndex] == CRITICAL_SECTION_NOT_VISITED)
+                {
+                    oldValues[csIndex] =
+                        tid; // set the winner id for this critical section
+                    stop = 1;
+                }
+                host_atomic_flag_clear(&destMemory[cnt], MemoryOrderForClear()); // clear with an order that is legal for atomic_flag_clear
+            }
+        }
+    }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        expected = StartValue(); // every destination is expected to end at the start value; the other arguments are unused
+        return true;
+    }
+    // Initialise the reference array: all threadCount slots start out as
+    // CRITICAL_SECTION_NOT_VISITED. The random source `d` is not used.
+    // Always returns true.
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
+    {
+        cl_uint slot = 0;
+        while (slot < threadCount)
+        {
+            startRefValues[slot] = CRITICAL_SECTION_NOT_VISITED;
+            ++slot;
+        }
+        return true;
+    }
+    // Check that every critical section was claimed by a distinct thread.
+    //
+    // refValues[i] holds the id of the thread that won critical section i.
+    // `correct` is set to false (and an error is logged) as soon as a slot
+    // still holds CRITICAL_SECTION_NOT_VISITED, holds an id that is not
+    // below threadCount, or repeats an id seen in an earlier slot.
+    // finalValues is not inspected. The return value is always true; the
+    // verdict is reported only through `correct`.
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        correct = true;
+        /* We are expecting unique values from 0 to threadCount-1 (each critical
+         * section must be visited) */
+        /* These values must be distributed across refValues array */
+        std::vector<bool> tidFound(threadCount);
+        for (cl_uint i = 0; i < threadCount; i++)
+        {
+            cl_uint value = (cl_uint)refValues[i];
+            if (value == CRITICAL_SECTION_NOT_VISITED)
+            {
+                // Special initial value
+                log_error("ERROR: Critical section %u not visited\n", i);
+                correct = false;
+                return true;
+            }
+            if (value >= threadCount)
+            {
+                log_error(
+                    "ERROR: Reference value %u outside of valid range! (%u)\n",
+                    i, value);
+                correct = false;
+                return true;
+            }
+            if (tidFound[value])
+            {
+                // The same thread id won two critical sections.
+                log_error("ERROR: Value (%u) occurred more than once\n",
+                          value);
+                correct = false;
+                return true;
+            }
+            tidFound[value] = true;
+        }
+        return true;
+    }
+int test_atomic_flag_generic(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements,
+                             bool useSVM)
+    int error = 0;
+                                                          useSVM); // NOTE(review): the line declaring test_flag, which this argument list ends, is missing from this patch text
+    EXECUTE_TEST(error,
+                 test_flag.Execute(deviceID, context, queue, num_elements));
+    return error; // result accumulated by EXECUTE_TEST
+int test_atomic_flag(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
+    return test_atomic_flag_generic(deviceID, context, queue, num_elements,
+                                    false); // useSVM = false: non-SVM run
+int test_svm_atomic_flag(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
+    return test_atomic_flag_generic(deviceID, context, queue, num_elements,
+                                    true); // useSVM = true: SVM run
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestFence // fence test: checks that a store side (op1) synchronizes with a load side (op2)
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+    struct TestDefinition // one sub-case: how op1 and op2 are carried out
+    {
+        bool op1IsFence; // true: fence followed by a relaxed store; false: the store itself uses op1MemOrder
+        TExplicitMemoryOrderType op1MemOrder; // order of the op1 fence or store
+        bool op2IsFence; // true: relaxed load followed by a fence; false: the load itself uses op2MemOrder
+        TExplicitMemoryOrderType op2MemOrder; // order of the op2 fence or load
+    };
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; // the using-declarations expose dependent-base members unqualified
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::DeclaredInProgram;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UsedInFunction;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
+    using CBasicTestMemOrderScope<HostAtomicType,
+                                  HostDataType>::CurrentGroupSize;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalRefValues;
+    // Build the fence test for the given atomic type. Both arguments are
+    // forwarded unchanged to the CBasicTestMemOrderScope base.
+    CBasicTestFence(TExplicitAtomicType dataType, bool useSVM)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {
+        this->StartValue(0);
+        this->OldValueCheck(false);
+    }
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        return threadCount; // one atomic variable per thread; deviceID is not consulted
+    }
+    // Number of non-atomic slots each thread owns in the reference array.
+    //
+    // seq_cst needs a single slot; global memory uses 256. For local memory
+    // the count is derived from the minimum local memory size the spec
+    // requires. On embedded profiles the group size is clamped to 512 as a
+    // side effect of this call.
+    virtual cl_uint NumNonAtomicVariablesPerThread()
+    {
+        if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) return 1;
+        if (!LocalMemory()) return 256;
+        if (!gIsEmbedded)
+        {
+            // 32KB of local memory required by spec
+            return 32 * 1024 / 8 / CurrentGroupSize() - 1;
+        }
+        // 1KB of local memory required by spec. Clamp group size to 512 and
+        // allow 2 variables per thread
+        if (CurrentGroupSize() > 512) CurrentGroupSize(512);
+        return 2;
+    }
+    // Human-readable name of the current sub-case: a description of the
+    // fence scenario, then the base CBasicTest name, then the memory scope.
+    virtual std::string SingleTestName()
+    {
+        // sizeof counts the terminating NUL of the literal, so these skip
+        // one character more than the literal's text (presumably the '_'
+        // that follows the prefix in the type names).
+        const std::string::size_type orderSkip = sizeof("memory_order");
+        const std::string::size_type scopeSkip = sizeof("memory");
+
+        std::string scenario;
+        if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
+        {
+            scenario = "seq_cst fence, ";
+        }
+        else
+        {
+            const std::string op1Order =
+                std::string(get_memory_order_type_name(_subCase.op1MemOrder))
+                    .substr(orderSkip);
+            const std::string op2Order =
+                std::string(get_memory_order_type_name(_subCase.op2MemOrder))
+                    .substr(orderSkip);
+            const char *op1Kind = _subCase.op1IsFence ? " fence" : " atomic";
+            const char *op2Kind = _subCase.op2IsFence ? " fence" : " atomic";
+            scenario = op1Order + op1Kind + " synchronizes-with " + op2Order
+                + op2Kind + ", ";
+        }
+
+        const std::string scopeName =
+            std::string(get_memory_scope_type_name(MemoryScope()))
+                .substr(scopeSkip);
+
+        std::string testName = scenario;
+        testName += CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        testName += std::string(", ") + scopeName;
+        return testName;
+    }
+    // True when the scope under test is one of the two "all devices" scopes.
+    virtual bool SVMDataBufferAllSVMConsistent()
+    {
+        // memory_scope_all_devices does not mention SVM, but it is just an
+        // alias for memory_scope_all_svm_devices, so both scopes interact
+        // with SVM allocations (on devices that support them) identically.
+        switch (MemoryScope())
+        {
+            case MEMORY_SCOPE_ALL_DEVICES:
+            case MEMORY_SCOPE_ALL_SVM_DEVICES: return true;
+            default: return false;
+        }
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        int error = 0;
+        // run up to 3 sub cases per memory order (ExecuteSingleTest returns early for ids an order does not define)
+        for (_subCaseId = 0; _subCaseId < 3; _subCaseId++)
+        {
+            EXECUTE_TEST(
+                error,
+                (CBasicTestMemOrderScope<HostAtomicType, HostDataType>::
+                     ExecuteForEachParameterSet(deviceID, context, queue))); // the base sweep is repeated once per sub-case id
+        }
+        return error;
+    }
+    // Run the sub-case selected by _subCaseId for the current memory order.
+    //
+    // Returns 0 without running anything when the configuration is not
+    // applicable or when _subCaseId is beyond the sub-cases defined for the
+    // order. Otherwise stores the chosen definition in _subCase (not needed
+    // for seq_cst) and delegates to the base class.
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        typedef CBasicTestMemOrderScope<HostAtomicType, HostDataType> Base;
+
+        // skip test - not applicable - no overloaded fence functions for
+        // different address spaces
+        if (DeclaredInProgram() || UsedInFunction()) return 0;
+        // skip test - not applicable (empty 'scope' not required since
+        // opencl20-openclc-rev15)
+        if (MemoryOrder() == MEMORY_ORDER_EMPTY
+            || MemoryScope() == MEMORY_SCOPE_EMPTY)
+            return 0;
+        // skip test - not applicable for SVM and local memory
+        if ((UseSVM() || gHost) && LocalMemory()) return 0;
+
+        // Each entry: { op1IsFence, op1MemOrder, op2IsFence, op2MemOrder }
+        const TestDefinition acquireCases[] = {
+            { false, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQUIRE },
+            { true, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQUIRE },
+            { true, MEMORY_ORDER_ACQ_REL, true, MEMORY_ORDER_ACQUIRE }
+        };
+        const TestDefinition releaseCases[] = {
+            { true, MEMORY_ORDER_RELEASE, false, MEMORY_ORDER_ACQUIRE },
+            { true, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQ_REL }
+        };
+        const TestDefinition acqRelCases[] = {
+            { false, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQ_REL },
+            { true, MEMORY_ORDER_ACQ_REL, false, MEMORY_ORDER_ACQUIRE },
+            { true, MEMORY_ORDER_ACQ_REL, true, MEMORY_ORDER_ACQ_REL }
+        };
+
+        // Pick the table for the order under test. seq_cst has exactly one
+        // sub-case and no table entry.
+        const TestDefinition *cases = nullptr;
+        size_t caseCount = 0;
+        switch (MemoryOrder())
+        {
+            case MEMORY_ORDER_ACQUIRE:
+                cases = acquireCases;
+                caseCount = sizeof(acquireCases) / sizeof(acquireCases[0]);
+                break;
+            case MEMORY_ORDER_RELEASE:
+                cases = releaseCases;
+                caseCount = sizeof(releaseCases) / sizeof(releaseCases[0]);
+                break;
+            case MEMORY_ORDER_ACQ_REL:
+                cases = acqRelCases;
+                caseCount = sizeof(acqRelCases) / sizeof(acqRelCases[0]);
+                break;
+            case MEMORY_ORDER_SEQ_CST:
+                caseCount = 1; // one special case only
+                break;
+            default: return 0;
+        }
+        if (_subCaseId >= caseCount) return 0;
+        if (cases != nullptr) _subCase = cases[_subCaseId];
+
+        LocalRefValues(LocalMemory());
+        return Base::ExecuteSingleTest(deviceID, context, queue);
+    }
+    // Kernel-source header: the base class header, preceded (only when
+    // gOldAPI is set) by a macro that maps atomic_work_item_fence onto
+    // mem_fence. The macro takes two parameters when the scope is empty and
+    // three otherwise.
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
+    {
+        typedef CBasicTestMemOrderScope<HostAtomicType, HostDataType> Base;
+
+        std::string fenceMacro;
+        if (gOldAPI)
+        {
+            fenceMacro = (MemoryScope() == MEMORY_SCOPE_EMPTY)
+                ? "#define atomic_work_item_fence(x,y)                 "
+                  "       mem_fence(x)\n"
+                : "#define atomic_work_item_fence(x,y,z)               "
+                  "       mem_fence(x)\n";
+        }
+        return fenceMacro + Base::ProgramHeader(maxNumDestItems);
+    }
+    virtual std::string ProgramCore() // builds the OpenCL C kernel body for the current memory order / sub-case
+    {
+        std::ostringstream naValues;
+        naValues << NumNonAtomicVariablesPerThread(); // per-thread slot count, spliced into the kernel text as a literal
+        std::string program, fenceType, nonAtomic;
+        if (LocalMemory())
+        {
+            program = "  size_t myId = get_local_id(0), hisId = "
+                      "get_local_size(0)-1-myId;\n";
+            fenceType = "CLK_LOCAL_MEM_FENCE";
+            nonAtomic = "localValues";
+        }
+        else
+        {
+            program = "  size_t myId = tid, hisId = threadCount-1-tid;\n";
+            fenceType = "CLK_GLOBAL_MEM_FENCE";
+            nonAtomic = "oldValues";
+        }
+        if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
+        {
+            // All threads are divided into pairs.
+            // Each thread has its own atomic variable and performs the
+            // following actions:
+            // - increments its own variable
+            // - performs fence operation to propagate its value and to see
+            // value from other thread
+            // - reads value from other thread's variable
+            // - repeats the above steps when both values are the same (and less
+            // than 1000000)
+            // - stores the last value read from other thread (in additional
+            // variable). At the end of execution at least one thread should know
+            // the last value from other thread.
+            program += std::string("") + "  " + DataType().RegularTypeName()
+                + " myValue = 0, hisValue; \n"
+                  "  do {\n"
+                  "    myValue++;\n"
+                  "    atomic_store_explicit(&destMemory[myId], myValue, "
+                  "memory_order_relaxed"
+                + MemoryScopeStr()
+                + ");\n"
+                  "    atomic_work_item_fence("
+                + fenceType + ", memory_order_seq_cst" + MemoryScopeStr()
+                + "); \n"
+                  "    hisValue = atomic_load_explicit(&destMemory[hisId], "
+                  "memory_order_relaxed"
+                + MemoryScopeStr()
+                + ");\n"
+                  "  } while(myValue == hisValue && myValue < 1000000);\n"
+                  "  "
+                + nonAtomic + "[myId] = hisValue; \n";
+        }
+        else
+        {
+            // Each thread modifies one of its non-atomic variables, increments
+            // value of its atomic variable and reads values from another thread
+            // in typical synchronizes-with scenario with:
+            // - non-atomic variable (at index A) modification (value change
+            // from 0 to A)
+            // - release operation (additional fence or within atomic) + atomic
+            // variable modification (value A)
+            // - atomic variable read (value B) + acquire operation (additional
+            // fence or within atomic)
+            // - non-atomic variable (at index B) read (value C)
+            // Each thread verifies dependency between atomic and non-atomic
+            // value read from another thread. The following condition must be
+            // true: B == C
+            program += std::string("") + "  " + DataType().RegularTypeName()
+                + " myValue = 0, hisAtomicValue, hisValue; \n"
+                  "  do {\n"
+                  "    myValue++;\n"
+                  "    "
+                + nonAtomic + "[myId*" + naValues.str()
+                + "+myValue] = myValue;\n";
+            if (_subCase.op1IsFence)
+                program += std::string("") + "    atomic_work_item_fence("
+                    + fenceType + ", "
+                    + get_memory_order_type_name(_subCase.op1MemOrder)
+                    + MemoryScopeStr()
+                    + "); \n"
+                      "    atomic_store_explicit(&destMemory[myId], myValue, "
+                      "memory_order_relaxed"
+                    + MemoryScopeStr() + ");\n";
+            else
+                program += std::string("")
+                    + "    atomic_store_explicit(&destMemory[myId], myValue, "
+                    + get_memory_order_type_name(_subCase.op1MemOrder)
+                    + MemoryScopeStr() + ");\n";
+            if (_subCase.op2IsFence)
+                program += std::string("")
+                    + "    hisAtomicValue = "
+                      "atomic_load_explicit(&destMemory[hisId], "
+                      "memory_order_relaxed"
+                    + MemoryScopeStr()
+                    + ");\n"
+                      "    atomic_work_item_fence("
+                    + fenceType + ", "
+                    + get_memory_order_type_name(_subCase.op2MemOrder)
+                    + MemoryScopeStr() + "); \n";
+            else
+                program += std::string("")
+                    + "    hisAtomicValue = "
+                      "atomic_load_explicit(&destMemory[hisId], "
+                    + get_memory_order_type_name(_subCase.op2MemOrder)
+                    + MemoryScopeStr() + ");\n";
+            program += "    hisValue = " + nonAtomic + "[hisId*"
+                + naValues.str() + "+hisAtomicValue]; \n";
+            if (LocalMemory())
+                program += "    hisId = (hisId+1)%get_local_size(0);\n";
+            else
+                program += "    hisId = (hisId+1)%threadCount;\n";
+            program += "  } while(hisAtomicValue == hisValue && myValue < "
+                + naValues.str()
+                + "-1);\n"
+                  "  if(hisAtomicValue != hisValue)\n"
+                  "  { // fail\n"
+                  "    atomic_store(&destMemory[myId], myValue-1);\n";
+            if (LocalMemory())
+                program += "    hisId = "
+                           "(hisId+get_local_size(0)-1)%get_local_size(0);\n";
+            else
+                program += "    hisId = (hisId+threadCount-1)%threadCount;\n";
+            program += "    if(myValue+1 < " + naValues.str() // on failure the next free slots record hisId, hisAtomicValue and hisValue
+                + ")\n"
+                  "      "
+                + nonAtomic + "[myId*" + naValues.str()
+                + "+myValue+1] = hisId;\n"
+                  "    if(myValue+2 < "
+                + naValues.str()
+                + ")\n"
+                  "      "
+                + nonAtomic + "[myId*" + naValues.str()
+                + "+myValue+2] = hisAtomicValue;\n"
+                  "    if(myValue+3 < "
+                + naValues.str()
+                + ")\n"
+                  "      "
+                + nonAtomic + "[myId*" + naValues.str()
+                + "+myValue+3] = hisValue;\n";
+            if (gDebug) // debug mode: the kernel also prints the mismatch
+            {
+                program += "    printf(\"WI %d: atomic value (%d) at index %d "
+                           "is different than non-atomic value (%d)\\n\", tid, "
+                           "hisAtomicValue, hisId, hisValue);\n";
+            }
+            program += "  }\n";
+        }
+        return program;
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        size_t myId = tid, hisId = threadCount - 1 - tid; // partner is the thread mirrored from the other end of the range
+        if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
+        {
+            HostDataType myValue = 0, hisValue;
+            // CPU thread typically starts faster - wait for GPU thread
+            myValue++;
+            host_atomic_store<HostAtomicType, HostDataType>(
+                &destMemory[myId], myValue, MEMORY_ORDER_SEQ_CST);
+            while (host_atomic_load<HostAtomicType, HostDataType>(
+                       &destMemory[hisId], MEMORY_ORDER_SEQ_CST)
+                   == 0)
+                ; // spin until the partner has published a non-zero value
+            do
+            {
+                myValue++;
+                host_atomic_store<HostAtomicType, HostDataType>(
+                    &destMemory[myId], myValue, MEMORY_ORDER_RELAXED);
+                host_atomic_thread_fence(MemoryOrder());
+                hisValue = host_atomic_load<HostAtomicType, HostDataType>(
+                    &destMemory[hisId], MEMORY_ORDER_RELAXED);
+            } while (myValue == hisValue && hisValue < 1000000);
+            oldValues[tid] = hisValue; // remember the last value seen from the partner
+        }
+        else
+        {
+            HostDataType myValue = 0, hisAtomicValue, hisValue;
+            do
+            {
+                myValue++;
+                oldValues[myId * NumNonAtomicVariablesPerThread() + myValue] =
+                    myValue; // non-atomic write that the release side must publish
+                if (_subCase.op1IsFence)
+                {
+                    host_atomic_thread_fence(_subCase.op1MemOrder);
+                    host_atomic_store<HostAtomicType, HostDataType>(
+                        &destMemory[myId], myValue, MEMORY_ORDER_RELAXED);
+                }
+                else
+                    host_atomic_store<HostAtomicType, HostDataType>(
+                        &destMemory[myId], myValue, _subCase.op1MemOrder);
+                if (_subCase.op2IsFence)
+                {
+                    hisAtomicValue =
+                        host_atomic_load<HostAtomicType, HostDataType>(
+                            &destMemory[hisId], MEMORY_ORDER_RELAXED);
+                    host_atomic_thread_fence(_subCase.op2MemOrder);
+                }
+                else
+                    hisAtomicValue =
+                        host_atomic_load<HostAtomicType, HostDataType>(
+                            &destMemory[hisId], _subCase.op2MemOrder);
+                hisValue = oldValues[hisId * NumNonAtomicVariablesPerThread()
+                                     + hisAtomicValue]; // must equal hisAtomicValue if synchronization worked
+                hisId = (hisId + 1) % threadCount; // move on to the next thread for the following iteration
+            } while (hisAtomicValue == hisValue
+                     && myValue
+                         < (HostDataType)NumNonAtomicVariablesPerThread() - 1);
+            if (hisAtomicValue != hisValue)
+            { // fail: publish myValue - 1 so verification sees a value below the expected final one
+                host_atomic_store<HostAtomicType, HostDataType>(
+                    &destMemory[myId], myValue - 1, MEMORY_ORDER_SEQ_CST);
+                if (gDebug)
+                {
+                    hisId = (hisId + threadCount - 1) % threadCount; // undo the last increment to name the thread actually read
+                    printf("WI %d: atomic value (%d) at index %d is different "
+                           "than non-atomic value (%d)\n",
+                           tid, hisAtomicValue, hisId, hisValue); // NOTE(review): %d is used for size_t hisId and HostDataType values - confirm the formats match
+                }
+            }
+        }
+    }
+    // Zero the whole reference array, which holds
+    // NumNonAtomicVariablesPerThread() slots for each of the threadCount
+    // threads. The random source `d` is not used. Always returns true.
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
+    {
+        cl_uint slot = 0;
+        while (slot < threadCount * NumNonAtomicVariablesPerThread())
+        {
+            startRefValues[slot] = 0;
+            ++slot;
+        }
+        return true;
+    }
+    // Verify the outcome of one fence test run.
+    //
+    // finalValues holds each thread's atomic variable and refValues the
+    // non-atomic data. Threads are checked in chunks of workSize (the group
+    // size for local memory, otherwise all threads); thread i of a chunk is
+    // paired with thread workSize-1-i of the same chunk.
+    //
+    // seq_cst: if a thread's counter equals the value it last read from its
+    // partner, both must have reached 1000000; otherwise the two counters
+    // must differ and the thread with the higher counter must have read the
+    // partner's final counter.
+    // Other orders: every atomic variable must end at
+    // NumNonAtomicVariablesPerThread() - 1.
+    //
+    // `correct` receives the verdict; the return value is always true.
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
+    {
+        correct = true;
+        cl_uint workSize = LocalMemory() ? CurrentGroupSize() : threadCount;
+        for (cl_uint workOffset = 0; workOffset < threadCount;
+             workOffset += workSize)
+        {
+            if (workOffset + workSize > threadCount)
+                // last workgroup (host threads)
+                workSize = threadCount - workOffset;
+            for (cl_uint i = 0; i < workSize && workOffset + i < threadCount;
+                 i++)
+            {
+                HostAtomicType myValue = finalValues[workOffset + i];
+                if (MemoryOrder() == MEMORY_ORDER_SEQ_CST)
+                {
+                    HostDataType hisValue = refValues[workOffset + i];
+                    if (myValue == hisValue)
+                    {
+                        // a draw - both threads should reach final value
+                        // 1000000
+                        if (myValue != 1000000)
+                        {
+                            log_error("ERROR: Invalid reference value #%u (%d "
+                                      "instead of 1000000)\n",
+                                      workOffset + i, myValue);
+                            correct = false;
+                            return true;
+                        }
+                    }
+                    else
+                    {
+                        // slower thread (in total order of seq_cst operations)
+                        // must know last value written by faster thread
+                        HostAtomicType hisRealValue =
+                            finalValues[workOffset + workSize - 1 - i];
+                        HostDataType myValueReadByHim =
+                            refValues[workOffset + workSize - 1 - i];
+                        // who is the winner? - thread with lower private
+                        // counter value
+                        if (myValue == hisRealValue) // forbidden result - fence
+                                                     // doesn't work
+                        {
+                            log_error("ERROR: Atomic counter values #%u and "
+                                      "#%u are the same (%u)\n",
+                                      workOffset + i,
+                                      workOffset + workSize - 1 - i, myValue);
+                            log_error(
+                                "ERROR: Both threads have outdated values read "
+                                "from another thread (%u and %u)\n",
+                                hisValue, myValueReadByHim);
+                            correct = false;
+                            return true;
+                        }
+                        if (myValue > hisRealValue) // I'm slower
+                        {
+                            if (hisRealValue != hisValue)
+                            {
+                                log_error("ERROR: Invalid reference value #%u "
+                                          "(%d instead of %d)\n",
+                                          workOffset + i, hisValue,
+                                          hisRealValue);
+                                log_error(
+                                    "ERROR: Slower thread #%u should know "
+                                    "value written by faster thread #%u\n",
+                                    workOffset + i,
+                                    workOffset + workSize - 1 - i);
+                                correct = false;
+                                return true;
+                            }
+                        }
+                        else // I'm faster
+                        {
+                            if (myValueReadByHim != myValue)
+                            {
+                                log_error("ERROR: Invalid reference value #%u "
+                                          "(%d instead of %d)\n",
+                                          workOffset + workSize - 1 - i,
+                                          myValueReadByHim, myValue);
+                                log_error(
+                                    "ERROR: Slower thread #%u should know "
+                                    "value written by faster thread #%u\n",
+                                    workOffset + workSize - 1 - i,
+                                    workOffset + i);
+                                correct = false;
+                                return true;
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if (myValue != NumNonAtomicVariablesPerThread() - 1)
+                    {
+                        log_error("ERROR: Invalid atomic value #%u (%d instead "
+                                  "of %d)\n",
+                                  workOffset + i, myValue,
+                                  NumNonAtomicVariablesPerThread() - 1);
+                        // The format has a single conversion, so only the
+                        // thread index is passed.
+                        log_error("ERROR: Thread #%u observed invalid values "
+                                  "in other thread's variables\n",
+                                  workOffset + i);
+                        correct = false;
+                        return true;
+                    }
+                }
+            }
+        }
+        return true;
+    }
+    int _subCaseId;
+    struct TestDefinition _subCase;
+int test_atomic_fence_generic(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements,
+                              bool useSVM)
+    int error = 0;
+                                                        useSVM);
+    EXECUTE_TEST(error,
+                 test_int.Execute(deviceID, context, queue, num_elements));
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_uint.Execute(deviceID, context, queue, num_elements));
+                                                           useSVM);
+    EXECUTE_TEST(error,
+                 test_long.Execute(deviceID, context, queue, num_elements));
+                                                              useSVM);
+    EXECUTE_TEST(error,
+                 test_ulong.Execute(deviceID, context, queue, num_elements));
+    if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
+    {
+        CBasicTestFence<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFence<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    else
+    {
+        CBasicTestFence<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(
+            TYPE_ATOMIC_INTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_intptr_t.Execute(deviceID, context, queue, num_elements));
+            test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_uintptr_t.Execute(deviceID, context, queue, num_elements));
+        CBasicTestFence<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(
+            TYPE_ATOMIC_SIZE_T, useSVM);
+        EXECUTE_TEST(
+            error, test_size_t.Execute(deviceID, context, queue, num_elements));
+            test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM);
+        EXECUTE_TEST(
+            error,
+            test_ptrdiff_t.Execute(deviceID, context, queue, num_elements));
+    }
+    return error;
+int test_atomic_fence(cl_device_id deviceID, cl_context context,
+                      cl_command_queue queue, int num_elements)
+    return test_atomic_fence_generic(deviceID, context, queue, num_elements,
+                                     false);
+int test_svm_atomic_fence(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+    return test_atomic_fence_generic(deviceID, context, queue, num_elements,
+                                     true);
diff --git a/test_conformance/commonfns/test_sign.cpp b/test_conformance/commonfns/test_sign.cpp
index 1b842e3..6dba58d 100644
--- a/test_conformance/commonfns/test_sign.cpp
+++ b/test_conformance/commonfns/test_sign.cpp
@@ -223,14 +223,13 @@
-  if(err)
-    return err;
+  if (err) return err;
-    if( ! is_extension_available( device, "cl_khr_fp64"))
-    {
-        log_info( "skipping double test -- cl_khr_fp64 not supported.\n" );
-        return 0;
-    }
+  if (!is_extension_available(device, "cl_khr_fp64"))
+  {
+      log_info("skipping double test -- cl_khr_fp64 not supported.\n");
+      return 0;
+  }
     return test_sign_double( device, context, queue, n_elems);
diff --git a/test_conformance/commonfns/test_step.cpp b/test_conformance/commonfns/test_step.cpp
index 0e3cfe0..330083b 100644
--- a/test_conformance/commonfns/test_step.cpp
+++ b/test_conformance/commonfns/test_step.cpp
@@ -158,23 +158,20 @@
     err = create_single_kernel_helper( context, &program[0], &kernel[0], 1, &step_kernel_code, "test_step" );
-    if (err)
-        return -1;
+    if (err) return -1;
     err = create_single_kernel_helper( context, &program[1], &kernel[1], 1, &step2_kernel_code, "test_step2" );
-    if (err)
-        return -1;
+    if (err) return -1;
     err = create_single_kernel_helper( context, &program[2], &kernel[2], 1, &step4_kernel_code, "test_step4" );
-    if (err)
-        return -1;
-  err = create_single_kernel_helper( context, &program[3], &kernel[3], 1, &step8_kernel_code, "test_step8" );
-  if (err)
-    return -1;
-  err = create_single_kernel_helper( context, &program[4], &kernel[4], 1, &step16_kernel_code, "test_step16" );
-  if (err)
-    return -1;
-  err = create_single_kernel_helper( context, &program[5], &kernel[5], 1, &step3_kernel_code, "test_step3" );
-  if (err)
-    return -1;
+    if (err) return -1;
+    err = create_single_kernel_helper(context, &program[3], &kernel[3], 1,
+                                      &step8_kernel_code, "test_step8");
+    if (err) return -1;
+    err = create_single_kernel_helper(context, &program[4], &kernel[4], 1,
+                                      &step16_kernel_code, "test_step16");
+    if (err) return -1;
+    err = create_single_kernel_helper(context, &program[5], &kernel[5], 1,
+                                      &step3_kernel_code, "test_step3");
+    if (err) return -1;
     values[0] = streams[0];
     values[1] = streams[1];
diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index 483adac..b95b0f5 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -20,7 +20,7 @@
 #include <unistd.h>
+// List should follow order in the extension spec
 const char *known_extensions[] = {
@@ -42,6 +42,7 @@
+    "cl_khr_extended_async_copies",
@@ -49,7 +50,9 @@
+    "cl_khr_extended_bit_ops",
+    "cl_khr_integer_dot_product",
+    "cl_khr_subgroup_rotate",
     // API-only extensions after this point.  If you add above here, modify
     // first_API_extension below.
@@ -71,10 +74,23 @@
+    "cl_khr_pci_bus_info",
+    "cl_khr_suggested_local_work_size",
+    "cl_khr_spirv_linkonce_odr",
+    "cl_khr_semaphore",
+    "cl_khr_external_semaphore",
+    "cl_khr_external_semaphore_win32",
+    "cl_khr_external_semaphore_sync_fd",
+    "cl_khr_external_semaphore_opaque_fd",
+    "cl_khr_external_memory",
+    "cl_khr_external_memory_win32",
+    "cl_khr_external_memory_opaque_fd",
+    "cl_khr_command_buffer",
+    "cl_khr_command_buffer_mutable_dispatch",
-size_t num_known_extensions = sizeof(known_extensions)/sizeof(char*);
-size_t first_API_extension = 27;
+size_t num_known_extensions = ARRAY_SIZE(known_extensions);
+size_t first_API_extension = 31;
 const char *known_embedded_extensions[] = {
@@ -314,8 +330,15 @@
     // Build the kernel
-    char *kernel_code = (char*)malloc(1025*256*(num_not_supported_extensions+num_of_supported_extensions));
-    memset(kernel_code, 0, 1025*256*(num_not_supported_extensions+num_of_supported_extensions));
+    char *kernel_code = (char *)malloc(
+        1
+        + 1025 * 256
+            * (num_not_supported_extensions + num_of_supported_extensions));
+    memset(
+        kernel_code, 0,
+        1
+            + 1025 * 256
+                * (num_not_supported_extensions + num_of_supported_extensions));
     int i, index = 0;
     strcat(kernel_code, kernel_strings[0]);
@@ -340,8 +363,6 @@
     clProgramWrapper program;
     clKernelWrapper kernel;
-    Version version = get_device_cl_version(device);
     error = create_single_kernel_helper(context, &program, &kernel, 1,
                                         (const char **)&kernel_code, "test");
     test_error(error, "create_single_kernel_helper failed");
diff --git a/test_conformance/compiler/test_feature_macro.cpp b/test_conformance/compiler/test_feature_macro.cpp
index ac355dd..ef3c002 100644
--- a/test_conformance/compiler/test_feature_macro.cpp
+++ b/test_conformance/compiler/test_feature_macro.cpp
@@ -579,6 +579,78 @@
                                         compiler_status, supported);
+int test_feature_macro_integer_dot_product_input_4x8bit_packed(
+    cl_device_id deviceID, cl_context context, std::string test_macro_name,
+    cl_bool& supported)
+    cl_int error = TEST_FAIL;
+    cl_bool api_status;
+    cl_bool compiler_status;
+    log_info("\n%s ...\n", test_macro_name.c_str());
+    if (!is_extension_available(deviceID, "cl_khr_integer_dot_product"))
+    {
+        supported = false;
+        return TEST_PASS;
+    }
+    error = check_api_feature_info_capabilities<
+        cl_device_integer_dot_product_capabilities_khr>(
+        deviceID, context, api_status,
+    if (error != CL_SUCCESS)
+    {
+        return error;
+    }
+    error = check_compiler_feature_info(deviceID, context, test_macro_name,
+                                        compiler_status);
+    if (error != CL_SUCCESS)
+    {
+        return error;
+    }
+    return feature_macro_verify_results(test_macro_name, api_status,
+                                        compiler_status, supported);
+int test_feature_macro_integer_dot_product_input_4x8bit(
+    cl_device_id deviceID, cl_context context, std::string test_macro_name,
+    cl_bool& supported)
+    cl_int error = TEST_FAIL;
+    cl_bool api_status;
+    cl_bool compiler_status;
+    log_info("\n%s ...\n", test_macro_name.c_str());
+    if (!is_extension_available(deviceID, "cl_khr_integer_dot_product"))
+    {
+        supported = false;
+        return TEST_PASS;
+    }
+    error = check_api_feature_info_capabilities<
+        cl_device_integer_dot_product_capabilities_khr>(
+        deviceID, context, api_status,
+    if (error != CL_SUCCESS)
+    {
+        return error;
+    }
+    error = check_compiler_feature_info(deviceID, context, test_macro_name,
+                                        compiler_status);
+    if (error != CL_SUCCESS)
+    {
+        return error;
+    }
+    return feature_macro_verify_results(test_macro_name, api_status,
+                                        compiler_status, supported);
 int test_feature_macro_int64(cl_device_id deviceID, cl_context context,
                              std::string test_macro_name, cl_bool& supported)
@@ -686,15 +758,6 @@
     sort(vec_to_cmp.begin(), vec_to_cmp.end());
     sort(vec_device_feature_names.begin(), vec_device_feature_names.end());
-    if (vec_device_feature_names == vec_to_cmp)
-    {
-        log_info("Comparison list of features - passed\n");
-    }
-    else
-    {
-        log_info("Comparison list of features - failed\n");
-        error = TEST_FAIL;
-    }
         "Supported features based on CL_DEVICE_OPENCL_C_FEATURES API query:\n");
     for (auto each_f : vec_device_feature_names)
@@ -703,11 +766,26 @@
     log_info("\nSupported features based on queries to API/compiler :\n");
     for (auto each_f : vec_to_cmp)
         log_info("%s\n", each_f.c_str());
+    for (auto each_f : vec_to_cmp)
+    {
+        if (find(vec_device_feature_names.begin(),
+                 vec_device_feature_names.end(), each_f)
+            == vec_device_feature_names.end())
+        {
+            log_info("Comparison list of features - failed - missing %s\n",
+                     each_f.c_str());
+            return TEST_FAIL;
+        }
+    }
+    log_info("Comparison list of features - passed\n");
     return error;
@@ -748,6 +826,8 @@
+    NEW_FEATURE_MACRO_TEST(integer_dot_product_input_4x8bit);
+    NEW_FEATURE_MACRO_TEST(integer_dot_product_input_4x8bit_packed);
     error |= test_consistency_c_features_list(deviceID, supported_features_vec);
diff --git a/test_conformance/computeinfo/CMakeLists.txt b/test_conformance/computeinfo/CMakeLists.txt
index 207223a..06f0599 100644
--- a/test_conformance/computeinfo/CMakeLists.txt
+++ b/test_conformance/computeinfo/CMakeLists.txt
@@ -5,6 +5,7 @@
+        pci_bus_info.cpp
diff --git a/test_conformance/computeinfo/device_uuid.cpp b/test_conformance/computeinfo/device_uuid.cpp
index 1ef9dad..7f29d0b 100644
--- a/test_conformance/computeinfo/device_uuid.cpp
+++ b/test_conformance/computeinfo/device_uuid.cpp
@@ -105,7 +105,7 @@
     if (!is_extension_available(deviceID, "cl_khr_device_uuid"))
         log_info("cl_khr_device_uuid not supported. Skipping test...\n");
-        return 0;
+        return TEST_SKIPPED_ITSELF;
     int total_errors = 0;
diff --git a/test_conformance/computeinfo/main.cpp b/test_conformance/computeinfo/main.cpp
index 4860b44..382cd6a 100644
--- a/test_conformance/computeinfo/main.cpp
+++ b/test_conformance/computeinfo/main.cpp
@@ -95,8 +95,8 @@
 struct _extensions
-    int cl_khr_fp64;
-    int cl_khr_fp16;
+    int has_cl_khr_fp64;
+    int has_cl_khr_fp16;
 typedef struct _extensions extensions_t;
@@ -908,12 +908,6 @@
                     cl_name_version new_version_item =
-                    cl_version new_version_major =
-                        CL_VERSION_MAJOR_KHR(new_version_item.version);
-                    cl_version new_version_minor =
-                        CL_VERSION_MINOR_KHR(new_version_item.version);
-                    cl_version new_version_patch =
-                        CL_VERSION_PATCH_KHR(new_version_item.version);
                     log_info("\t\t\"%s\" %d.%d.%d\n",,
@@ -1069,11 +1063,11 @@
         if (strncmp(begin, "cl_khr_fp64", length) == 0)
-            extensions->cl_khr_fp64 = 1;
+            extensions->has_cl_khr_fp64 = 1;
         if (strncmp(begin, "cl_khr_fp16", length) == 0)
-            extensions->cl_khr_fp16 = 1;
+            extensions->has_cl_khr_fp16 = 1;
         begin += length; // Skip word.
         if (begin[0] == ' ')
@@ -1112,13 +1106,13 @@
             // version 1.1, we have to check doubles are sopported. In
             // OpenCL 1.2 CL_DEVICE_DOUBLE_FP_CONFIG should be reported
             // unconditionally.
-            get = extensions.cl_khr_fp64;
+            get = extensions.has_cl_khr_fp64;
         if (info.opcode == CL_DEVICE_HALF_FP_CONFIG)
             // CL_DEVICE_HALF_FP_CONFIG should be reported only when cl_khr_fp16
             // extension is available
-            get = extensions.cl_khr_fp16;
+            get = extensions.has_cl_khr_fp16;
         if (get)
@@ -1421,15 +1415,16 @@
 extern int test_extended_versioning(cl_device_id, cl_context, cl_command_queue,
 extern int test_device_uuid(cl_device_id, cl_context, cl_command_queue, int);
 extern int test_conformance_version(cl_device_id, cl_context, cl_command_queue,
+extern int test_pci_bus_info(cl_device_id, cl_context, cl_command_queue, int);
 test_definition test_list[] = {
     ADD_TEST_VERSION(conformance_version, Version(3, 0)),
+    ADD_TEST(pci_bus_info),
 const int test_num = ARRAY_SIZE(test_list);
diff --git a/test_conformance/computeinfo/pci_bus_info.cpp b/test_conformance/computeinfo/pci_bus_info.cpp
new file mode 100644
index 0000000..cd62ca0
--- /dev/null
+++ b/test_conformance/computeinfo/pci_bus_info.cpp
@@ -0,0 +1,53 @@
+// Copyright (c) 2021 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "harness/compat.h"
+#include <array>
+#include <bitset>
+#include "harness/testHarness.h"
+#include "harness/deviceInfo.h"
+int test_pci_bus_info(cl_device_id deviceID, cl_context context,
+                      cl_command_queue ignoreQueue, int num_elements)
+    if (!is_extension_available(deviceID, "cl_khr_pci_bus_info"))
+    {
+        log_info("cl_khr_pci_bus_info not supported. Skipping test...\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    cl_int error;
+    cl_device_pci_bus_info_khr info;
+    size_t size_ret;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PCI_BUS_INFO_KHR, 0, NULL,
+                            &size_ret);
+    test_error(error, "Unable to query CL_DEVICE_PCI_BUS_INFO_KHR size");
+    test_assert_error(
+        size_ret == sizeof(info),
+        "Query for CL_DEVICE_PCI_BUS_INFO_KHR returned an unexpected size");
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PCI_BUS_INFO_KHR, sizeof(info),
+                            &info, NULL);
+    test_error(error, "Unable to query CL_DEVICE_PCI_BUS_INFO_KHR");
+    log_info("\tPCI Bus Info: %04x:%02x:%02x.%x\n", info.pci_domain,
+             info.pci_bus, info.pci_device, info.pci_function);
+    return TEST_PASS;
diff --git a/test_conformance/contractions/contractions.cpp b/test_conformance/contractions/contractions.cpp
index dddebb4..474fd36 100644
--- a/test_conformance/contractions/contractions.cpp
+++ b/test_conformance/contractions/contractions.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -434,7 +434,6 @@
-    vlog( "\n\nTest binary built %s %s\n", __DATE__, __TIME__ );
diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index 3299884..3ee072d 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -696,7 +696,8 @@
 static void int2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_int*) in)[0]; }
 static void int2float( void *out, void *in)
-    cl_int l = ((cl_int*) in)[0];
+    // Use volatile to prevent optimization by Clang compiler
+    volatile cl_int l = ((cl_int *)in)[0];
     ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
 static void int2double( void *out, void *in)
diff --git a/test_conformance/conversions/fplib.cpp b/test_conformance/conversions/fplib.cpp
index e739b9a..3b19b56 100644
--- a/test_conformance/conversions/fplib.cpp
+++ b/test_conformance/conversions/fplib.cpp
@@ -79,7 +79,6 @@
             uint32_t mantissa;
             if (mantShift >= 0){
                 uint64_t temp = (uint64_t)data >> mantShift;
-                uint64_t mask = (1 << mantShift) - 1;
                 if ((temp << mantShift) != data)
                     inExact = 1;
                 mantissa = (uint32_t)temp;
@@ -124,7 +123,6 @@
             uint32_t    mantissa;
             if (mantShift >= 0){
                 uint64_t temp = (uint64_t)data >> mantShift;
-                uint64_t mask = (1 << mantShift) - 1;
                 if (temp << mantShift != data)
                     inExact = 1;
                 mantissa = (uint32_t)temp;
@@ -183,7 +181,6 @@
             uint32_t    mantissa;
             if (mantShift >= 0){
                 uint64_t temp = data >> mantShift;
-                uint64_t mask = (1 << mantShift) - 1;
                 if (temp << mantShift != data)
                     inExact = 1;
                 mantissa = (uint32_t)temp;
@@ -209,7 +206,6 @@
             uint32_t  mantissa;
             if (mantShift >= 0){
                 uint64_t temp = (uint64_t)data >> mantShift;
-                uint64_t mask = (1 << mantShift) - 1;
                 if (temp << mantShift != data)
                     inExact = 1;
                 mantissa = (uint32_t)temp;
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index 87b8ead..2b18b92 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -38,6 +38,7 @@
 #include <sys/param.h>
+#include <sstream>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
@@ -47,6 +48,8 @@
 #include <time.h>
+#include <algorithm>
 #include "Sleep.h"
 #include "basic_test_conversions.h"
@@ -340,7 +343,7 @@
 static int ParseArgs( int argc, const char **argv )
     int i;
-    argList = (const char **)calloc( argc - 1, sizeof( char*) );
+    argList = (const char **)calloc(argc, sizeof(char *));
     argCount = 0;
     if( NULL == argList && argc > 1 )
@@ -481,8 +484,6 @@
     vlog( "\n" );
-    vlog( "Test binary built %s %s\n", __DATE__, __TIME__ );
     if( gWimpyMode )
@@ -1003,7 +1004,8 @@
     uint64_t i;
-    size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] );
+    size_t blockCount =
+        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
     size_t step = blockCount;
     uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]);
     cl_event writeInputBuffer = NULL;
@@ -1078,7 +1080,7 @@
-        cl_uint count = (uint32_t) MIN( blockCount, lastCase - i );
+        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
         writeInputBufferInfo.count = count;
         // Crate a user event to represent the status of the reference value computation completion
@@ -1556,84 +1558,40 @@
     cl_program program;
     char testName[256];
     int error = 0;
-    const char **strings;
-    size_t stringCount = 0;
+    std::ostringstream source;
+    if (outType == kdouble || inType == kdouble)
+        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
     // Create the program. This is a bit complicated because we are trying to avoid byte and short stores.
     if (0 == vectorSize)
+        // Create the type names.
         char inName[32];
         char outName[32];
-        const char *programSource[] =
-        {
-            "", // optional pragma
-            "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   dest[i] =  src[i];\n"
-            "}\n"
-        };
-        stringCount = sizeof(programSource) / sizeof(programSource[0]);
-        strings = programSource;
-        if (outType == kdouble || inType == kdouble)
-            programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-        //create the type name
         strncpy(inName, gTypeNames[inType], sizeof(inName));
         strncpy(outName, gTypeNames[outType], sizeof(outName));
         sprintf(testName, "test_implicit_%s_%s", outName, inName);
-        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], gTypeNames[outType]);
+        source << "__kernel void " << testName << "( __global " << inName
+               << " *src, __global " << outName << " *dest )\n";
+        source << "{\n";
+        source << "   size_t i = get_global_id(0);\n";
+        source << "   dest[i] =  src[i];\n";
+        source << "}\n";
+        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
+             gTypeNames[outType]);
         int vectorSizetmp = vectorSizes[vectorSize];
+        // Create the type names.
         char convertString[128];
         char inName[32];
         char outName[32];
-        const char *programSource[] =
-        {
-            "", // optional pragma
-            "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   dest[i] = ", convertString, "( src[i] );\n"
-            "}\n"
-        };
-        const char *programSourceV3[] =
-        {
-            "", // optional pragma
-            "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   if( i + 1 < get_global_size(0))\n"
-            "       vstore3( ", convertString, "( vload3( i, src)), i, dest );\n"
-            "   else\n"
-            "   {\n"
-            "       ", inName, "3 in;\n"
-            "       ", outName, "3 out;\n"
-            "       if( 0 == (i & 1) )\n"
-            "           in.y = src[3*i+1];\n"
-            "       in.x = src[3*i];\n"
-            "       out = ", convertString, "( in ); \n"
-            "       dest[3*i] = out.x;\n"
-            "       if( 0 == (i & 1) )\n"
-            "           dest[3*i+1] = out.y;\n"
-            "   }\n"
-            "}\n"
-        };
-        stringCount = 3 == vectorSizetmp ? sizeof(programSourceV3) / sizeof(programSourceV3[0]) :
-            sizeof(programSource) / sizeof(programSource[0]);
-        strings = 3 == vectorSizetmp ? programSourceV3 : programSource;
-        if (outType == kdouble || inType == kdouble) {
-            programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-            programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-        }
-        //create the type name
         switch (vectorSizetmp)
         case 1:
@@ -1658,8 +1616,40 @@
             vlog("Building %s( %s ) test\n", convertString, inName);
+        if (vectorSizetmp == 3)
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   if( i + 1 < get_global_size(0))\n";
+            source << "       vstore3( " << convertString
+                   << "( vload3( i, src)), i, dest );\n";
+            source << "   else\n";
+            source << "   {\n";
+            source << "       " << inName << "3 in;\n";
+            source << "       " << outName << "3 out;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           in.y = src[3*i+1];\n";
+            source << "       in.x = src[3*i];\n";
+            source << "       out = " << convertString << "( in ); \n";
+            source << "       dest[3*i] = out.x;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           dest[3*i+1] = out.y;\n";
+            source << "   }\n";
+            source << "}\n";
+        }
+        else
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   dest[i] = " << convertString << "( src[i] );\n";
+            source << "}\n";
+        }
     *outKernel = NULL;
@@ -1668,11 +1658,12 @@
         flags = "-cl-denorms-are-zero";
     // build it
-    error = create_single_kernel_helper(gContext, &program, outKernel, (cl_uint)stringCount, strings, testName, flags);
+    std::string sourceString = source.str();
+    const char *programSource = sourceString.c_str();
+    error = create_single_kernel_helper(gContext, &program, outKernel, 1,
+                                        &programSource, testName, flags);
     if (error)
-        char    buffer[2048] = "";
         vlog_error("Failed to build kernel/program.\n", error);
         return NULL;
diff --git a/test_conformance/device_execution/enqueue_ndrange.cpp b/test_conformance/device_execution/enqueue_ndrange.cpp
index 8ced662..f228f06 100644
--- a/test_conformance/device_execution/enqueue_ndrange.cpp
+++ b/test_conformance/device_execution/enqueue_ndrange.cpp
@@ -18,6 +18,7 @@
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
+#include <algorithm>
 #include <vector>
 #include "procs.h"
@@ -645,7 +646,7 @@
     max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
-        max_local_size = MIN(8, max_local_size);
+        max_local_size = std::min((size_t)8, max_local_size);
     cl_uint num = 10;
diff --git a/test_conformance/device_execution/host_queue_order.cpp b/test_conformance/device_execution/host_queue_order.cpp
index 2b5688d..5376ea4 100644
--- a/test_conformance/device_execution/host_queue_order.cpp
+++ b/test_conformance/device_execution/host_queue_order.cpp
@@ -18,6 +18,7 @@
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
+#include <algorithm>
 #include <vector>
 #include "procs.h"
@@ -124,7 +125,7 @@
     cl_uint num = arr_size(result);
     if( gWimpyMode )
-        num = MAX(num / 16, 4);
+        num = std::max(num / 16, 4U);
     clMemWrapper res_mem;
diff --git a/test_conformance/events/action_classes.cpp b/test_conformance/events/action_classes.cpp
index d70d76b..a84be6b 100644
--- a/test_conformance/events/action_classes.cpp
+++ b/test_conformance/events/action_classes.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -19,7 +19,8 @@
 const cl_uint BufferSizeReductionFactor = 20;
-cl_int    Action::IGetPreferredImageSize2D( cl_device_id device, size_t &outWidth, size_t &outHeight )
+cl_int Action::IGetPreferredImageSize2D(cl_device_id device, size_t &outWidth,
+                                        size_t &outHeight)
     cl_ulong maxAllocSize;
     size_t maxWidth, maxHeight;
@@ -27,23 +28,27 @@
     // Get the largest possible buffer we could allocate
-    error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
-    error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( maxWidth ), &maxWidth, NULL );
-    error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( maxHeight ), &maxHeight, NULL );
-    test_error( error, "Unable to get device config" );
+    error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                             sizeof(maxWidth), &maxWidth, NULL);
+    error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                             sizeof(maxHeight), &maxHeight, NULL);
+    test_error(error, "Unable to get device config");
     // Create something of a decent size
-    if( maxWidth * maxHeight * 4 > maxAllocSize / BufferSizeReductionFactor )
+    if (maxWidth * maxHeight * 4 > maxAllocSize / BufferSizeReductionFactor)
-        float rootSize = sqrtf( (float)( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) );
+        float rootSize =
+            sqrtf((float)(maxAllocSize / (BufferSizeReductionFactor * 4)));
-        if( (size_t)rootSize > maxWidth )
+        if ((size_t)rootSize > maxWidth)
             outWidth = maxWidth;
             outWidth = (size_t)rootSize;
-        outHeight = (size_t)( ( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) / outWidth );
-        if( outHeight > maxHeight )
-            outHeight = maxHeight;
+        outHeight = (size_t)((maxAllocSize / (BufferSizeReductionFactor * 4))
+                             / outWidth);
+        if (outHeight > maxHeight) outHeight = maxHeight;
@@ -51,19 +56,18 @@
         outHeight = maxHeight;
-    outWidth /=2;
-    outHeight /=2;
+    outWidth /= 2;
+    outHeight /= 2;
-    if (outWidth > 2048)
-        outWidth = 2048;
-    if (outHeight > 2048)
-        outHeight = 2048;
+    if (outWidth > 2048) outWidth = 2048;
+    if (outHeight > 2048) outHeight = 2048;
     log_info("\tImage size: %d x %d (%gMB)\n", (int)outWidth, (int)outHeight,
-             (double)((int)outWidth*(int)outHeight*4)/(1024.0*1024.0));
+             (double)((int)outWidth * (int)outHeight * 4) / (1024.0 * 1024.0));
     return CL_SUCCESS;
-cl_int    Action::IGetPreferredImageSize3D( cl_device_id device, size_t &outWidth, size_t &outHeight, size_t &outDepth )
+cl_int Action::IGetPreferredImageSize3D(cl_device_id device, size_t &outWidth,
+                                        size_t &outHeight, size_t &outDepth)
     cl_ulong maxAllocSize;
     size_t maxWidth, maxHeight, maxDepth;
@@ -71,28 +75,34 @@
     // Get the largest possible buffer we could allocate
-    error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
-    error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof( maxWidth ), &maxWidth, NULL );
-    error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof( maxHeight ), &maxHeight, NULL );
-    error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof( maxDepth ), &maxDepth, NULL );
-    test_error( error, "Unable to get device config" );
+    error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH,
+                             sizeof(maxWidth), &maxWidth, NULL);
+    error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+                             sizeof(maxHeight), &maxHeight, NULL);
+    error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH,
+                             sizeof(maxDepth), &maxDepth, NULL);
+    test_error(error, "Unable to get device config");
     // Create something of a decent size
-    if( (cl_ulong)maxWidth * maxHeight * maxDepth > maxAllocSize / ( BufferSizeReductionFactor * 4 ) )
+    if ((cl_ulong)maxWidth * maxHeight * maxDepth
+        > maxAllocSize / (BufferSizeReductionFactor * 4))
-        float rootSize = cbrtf( (float)( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) );
+        float rootSize =
+            cbrtf((float)(maxAllocSize / (BufferSizeReductionFactor * 4)));
-        if( (size_t)rootSize > maxWidth )
+        if ((size_t)rootSize > maxWidth)
             outWidth = maxWidth;
             outWidth = (size_t)rootSize;
-        if( (size_t)rootSize > maxHeight )
+        if ((size_t)rootSize > maxHeight)
             outHeight = maxHeight;
             outHeight = (size_t)rootSize;
-        outDepth = (size_t)( ( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) / ( outWidth * outHeight ) );
-        if( outDepth > maxDepth )
-            outDepth = maxDepth;
+        outDepth = (size_t)((maxAllocSize / (BufferSizeReductionFactor * 4))
+                            / (outWidth * outHeight));
+        if (outDepth > maxDepth) outDepth = maxDepth;
@@ -101,25 +111,25 @@
         outDepth = maxDepth;
-    outWidth /=2;
-    outHeight /=2;
-    outDepth /=2;
+    outWidth /= 2;
+    outHeight /= 2;
+    outDepth /= 2;
-    if (outWidth > 512)
-        outWidth = 512;
-    if (outHeight > 512)
-        outHeight = 512;
-    if (outDepth > 512)
-        outDepth = 512;
-    log_info("\tImage size: %d x %d x %d (%gMB)\n", (int)outWidth, (int)outHeight, (int)outDepth,
-             (double)((int)outWidth*(int)outHeight*(int)outDepth*4)/(1024.0*1024.0));
+    if (outWidth > 512) outWidth = 512;
+    if (outHeight > 512) outHeight = 512;
+    if (outDepth > 512) outDepth = 512;
+    log_info("\tImage size: %d x %d x %d (%gMB)\n", (int)outWidth,
+             (int)outHeight, (int)outDepth,
+             (double)((int)outWidth * (int)outHeight * (int)outDepth * 4)
+                 / (1024.0 * 1024.0));
     return CL_SUCCESS;
 #pragma mark -------------------- Execution Sub-Classes -------------------------
-cl_int NDRangeKernelAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int NDRangeKernelAction::Setup(cl_device_id device, cl_context context,
+                                  cl_command_queue queue)
     const char *long_kernel[] = {
         "__kernel void sample_test(__global float *src, __global int *dst)\n"
@@ -132,101 +142,116 @@
         "        dst[tid] = (int)src[tid] * 3;\n"
         "    }\n"
-        "}\n" };
+        "}\n"
+    };
     size_t threads[1] = { 1000 };
     int error;
-    if( create_single_kernel_helper( context, &mProgram, &mKernel, 1, long_kernel, "sample_test" ) )
+    if (create_single_kernel_helper(context, &mProgram, &mKernel, 1,
+                                    long_kernel, "sample_test"))
         return -1;
-    error = get_max_common_work_group_size( context, mKernel, threads[0], &mLocalThreads[0] );
-    test_error( error, "Unable to get work group size to use" );
+    error = get_max_common_work_group_size(context, mKernel, threads[0],
+                                           &mLocalThreads[0]);
+    test_error(error, "Unable to get work group size to use");
     mStreams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                  sizeof(cl_float) * 1000, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     mStreams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                  sizeof(cl_int) * 1000, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     /* Set the arguments */
-    error = clSetKernelArg( mKernel, 0, sizeof( mStreams[0] ), &mStreams[0] );
-    test_error( error, "Unable to set kernel arguments" );
-    error = clSetKernelArg( mKernel, 1, sizeof( mStreams[1] ), &mStreams[1] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(mKernel, 0, sizeof(mStreams[0]), &mStreams[0]);
+    test_error(error, "Unable to set kernel arguments");
+    error = clSetKernelArg(mKernel, 1, sizeof(mStreams[1]), &mStreams[1]);
+    test_error(error, "Unable to set kernel arguments");
     return CL_SUCCESS;
-cl_int    NDRangeKernelAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int NDRangeKernelAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                    cl_event *waits, cl_event *outEvent)
     size_t threads[1] = { 1000 };
-    cl_int error = clEnqueueNDRangeKernel( queue, mKernel, 1, NULL, threads, mLocalThreads, numWaits, waits, outEvent );
-    test_error( error, "Unable to execute kernel" );
+    cl_int error =
+        clEnqueueNDRangeKernel(queue, mKernel, 1, NULL, threads, mLocalThreads,
+                               numWaits, waits, outEvent);
+    test_error(error, "Unable to execute kernel");
     return CL_SUCCESS;
 #pragma mark -------------------- Buffer Sub-Classes -------------------------
-cl_int BufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue, bool allocate )
+cl_int BufferAction::Setup(cl_device_id device, cl_context context,
+                           cl_command_queue queue, bool allocate)
     cl_int error;
     cl_ulong maxAllocSize;
     // Get the largest possible buffer we could allocate
-    error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
+    error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
-    // Don't create a buffer quite that big, just so we have some space left over for other work
-    mSize = (size_t)( maxAllocSize / BufferSizeReductionFactor );
+    // Don't create a buffer quite that big, just so we have some space left
+    // over for other work
+    mSize = (size_t)(maxAllocSize / BufferSizeReductionFactor);
     // Cap at 128M so tests complete in a reasonable amount of time.
-    if (mSize > 128 << 20)
-        mSize = 128 << 20;
+    if (mSize > 128 << 20) mSize = 128 << 20;
-    mSize /=2;
+    mSize /= 2;
-    log_info("\tBuffer size: %gMB\n", (double)mSize/(1024.0*1024.0));
+    log_info("\tBuffer size: %gMB\n", (double)mSize / (1024.0 * 1024.0));
-    mBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, mSize, NULL, &error );
-    test_error( error, "Unable to create buffer to test against" );
+    mBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                             mSize, NULL, &error);
+    test_error(error, "Unable to create buffer to test against");
-    mOutBuffer = malloc( mSize );
-    if( mOutBuffer == NULL )
+    mOutBuffer = malloc(mSize);
+    if (mOutBuffer == NULL)
-        log_error( "ERROR: Unable to allocate temp buffer (out of memory)\n" );
+        log_error("ERROR: Unable to allocate temp buffer (out of memory)\n");
         return CL_OUT_OF_RESOURCES;
     return CL_SUCCESS;
-cl_int ReadBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int ReadBufferAction::Setup(cl_device_id device, cl_context context,
+                               cl_command_queue queue)
-    return BufferAction::Setup( device, context, queue, true );
+    return BufferAction::Setup(device, context, queue, true);
-cl_int    ReadBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int ReadBufferAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                 cl_event *waits, cl_event *outEvent)
-    cl_int error = clEnqueueReadBuffer( queue, mBuffer, CL_FALSE, 0, mSize, mOutBuffer, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue buffer read" );
+    cl_int error = clEnqueueReadBuffer(queue, mBuffer, CL_FALSE, 0, mSize,
+                                       mOutBuffer, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue buffer read");
     return CL_SUCCESS;
-cl_int WriteBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int WriteBufferAction::Setup(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
-    return BufferAction::Setup( device, context, queue, true );
+    return BufferAction::Setup(device, context, queue, true);
-cl_int WriteBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int WriteBufferAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                  cl_event *waits, cl_event *outEvent)
-    cl_int error = clEnqueueWriteBuffer( queue, mBuffer, CL_FALSE, 0, mSize, mOutBuffer, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue buffer write" );
+    cl_int error = clEnqueueWriteBuffer(queue, mBuffer, CL_FALSE, 0, mSize,
+                                        mOutBuffer, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue buffer write");
     return CL_SUCCESS;
@@ -234,40 +259,46 @@
     if (mQueue)
-        clEnqueueUnmapMemObject( mQueue, mBuffer, mMappedPtr, 0, NULL, NULL );
+        clEnqueueUnmapMemObject(mQueue, mBuffer, mMappedPtr, 0, NULL, NULL);
-cl_int MapBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int MapBufferAction::Setup(cl_device_id device, cl_context context,
+                              cl_command_queue queue)
-    return BufferAction::Setup( device, context, queue, false );
+    return BufferAction::Setup(device, context, queue, false);
-cl_int MapBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int MapBufferAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                cl_event *waits, cl_event *outEvent)
     cl_int error;
     mQueue = queue;
-    mMappedPtr = clEnqueueMapBuffer( queue, mBuffer, CL_FALSE, CL_MAP_READ, 0, mSize, numWaits, waits, outEvent, &error );
-    test_error( error, "Unable to enqueue buffer map" );
+    mMappedPtr = clEnqueueMapBuffer(queue, mBuffer, CL_FALSE, CL_MAP_READ, 0,
+                                    mSize, numWaits, waits, outEvent, &error);
+    test_error(error, "Unable to enqueue buffer map");
     return CL_SUCCESS;
-cl_int UnmapBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int UnmapBufferAction::Setup(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
-    cl_int error = BufferAction::Setup( device, context, queue, false );
-    if( error != CL_SUCCESS )
-        return error;
+    cl_int error = BufferAction::Setup(device, context, queue, false);
+    if (error != CL_SUCCESS) return error;
-    mMappedPtr = clEnqueueMapBuffer( queue, mBuffer, CL_TRUE, CL_MAP_READ, 0, mSize, 0, NULL, NULL, &error );
-    test_error( error, "Unable to enqueue buffer map" );
+    mMappedPtr = clEnqueueMapBuffer(queue, mBuffer, CL_TRUE, CL_MAP_READ, 0,
+                                    mSize, 0, NULL, NULL, &error);
+    test_error(error, "Unable to enqueue buffer map");
     return CL_SUCCESS;
-cl_int UnmapBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int UnmapBufferAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                  cl_event *waits, cl_event *outEvent)
-    cl_int error = clEnqueueUnmapMemObject( queue, mBuffer, mMappedPtr, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue buffer unmap" );
+    cl_int error = clEnqueueUnmapMemObject(queue, mBuffer, mMappedPtr, numWaits,
+                                           waits, outEvent);
+    test_error(error, "Unable to enqueue buffer unmap");
     return CL_SUCCESS;
@@ -275,349 +306,410 @@
 #pragma mark -------------------- Read/Write Image Classes -------------------------
-cl_int ReadImage2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int ReadImage2DAction::Setup(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) )
+    if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight)))
         return error;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
+    mImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                             mHeight, 0, NULL, &error);
-    test_error( error, "Unable to create image to test against" );
+    test_error(error, "Unable to create image to test against");
-    mOutput = malloc( mWidth * mHeight * 4 );
-    if( mOutput == NULL )
+    mOutput = malloc(mWidth * mHeight * 4);
+    if (mOutput == NULL)
-        log_error( "ERROR: Unable to allocate buffer: out of memory\n" );
+        log_error("ERROR: Unable to allocate buffer: out of memory\n");
         return CL_OUT_OF_RESOURCES;
     return CL_SUCCESS;
-cl_int ReadImage2DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int ReadImage2DAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                  cl_event *waits, cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 };
-    cl_int error = clEnqueueReadImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image read" );
+    cl_int error = clEnqueueReadImage(queue, mImage, CL_FALSE, origin, region,
+                                      0, 0, mOutput, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image read");
     return CL_SUCCESS;
-cl_int ReadImage3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int ReadImage3DAction::Setup(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                             mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mOutput = malloc( mWidth * mHeight * mDepth * 4 );
-    if( mOutput == NULL )
+    mOutput = malloc(mWidth * mHeight * mDepth * 4);
+    if (mOutput == NULL)
-        log_error( "ERROR: Unable to allocate buffer: out of memory\n" );
+        log_error("ERROR: Unable to allocate buffer: out of memory\n");
         return CL_OUT_OF_RESOURCES;
     return CL_SUCCESS;
-cl_int ReadImage3DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int ReadImage3DAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                  cl_event *waits, cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth };
-    cl_int error = clEnqueueReadImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image read" );
+    cl_int error = clEnqueueReadImage(queue, mImage, CL_FALSE, origin, region,
+                                      0, 0, mOutput, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image read");
     return CL_SUCCESS;
-cl_int WriteImage2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int WriteImage2DAction::Setup(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) )
+    if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight)))
         return error;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth,
+                             mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mOutput = malloc( mWidth * mHeight * 4 );
-    if( mOutput == NULL )
+    mOutput = malloc(mWidth * mHeight * 4);
+    if (mOutput == NULL)
-        log_error( "ERROR: Unable to allocate buffer: out of memory\n" );
+        log_error("ERROR: Unable to allocate buffer: out of memory\n");
         return CL_OUT_OF_RESOURCES;
     return CL_SUCCESS;
-cl_int WriteImage2DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int WriteImage2DAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                   cl_event *waits, cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 };
-    cl_int error = clEnqueueWriteImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image write" );
+    cl_int error =
+        clEnqueueWriteImage(queue, mImage, CL_FALSE, origin, region, 0, 0,
+                            mOutput, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image write");
     return CL_SUCCESS;
-cl_int WriteImage3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int WriteImage3DAction::Setup(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                             mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mOutput = malloc( mWidth * mHeight * mDepth * 4 );
-    if( mOutput == NULL )
+    mOutput = malloc(mWidth * mHeight * mDepth * 4);
+    if (mOutput == NULL)
-        log_error( "ERROR: Unable to allocate buffer: out of memory\n" );
+        log_error("ERROR: Unable to allocate buffer: out of memory\n");
         return CL_OUT_OF_RESOURCES;
     return CL_SUCCESS;
-cl_int WriteImage3DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int WriteImage3DAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                   cl_event *waits, cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth };
-    cl_int error = clEnqueueWriteImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image write" );
+    cl_int error =
+        clEnqueueWriteImage(queue, mImage, CL_FALSE, origin, region, 0, 0,
+                            mOutput, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image write");
     return CL_SUCCESS;
 #pragma mark -------------------- Copy Image Classes -------------------------
-cl_int CopyImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int CopyImageAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                                cl_event *waits, cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth };
-    cl_int error = clEnqueueCopyImage( queue, mSrcImage, mDstImage, origin, origin, region, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image copy" );
+    cl_int error =
+        clEnqueueCopyImage(queue, mSrcImage, mDstImage, origin, origin, region,
+                           numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image copy");
     return CL_SUCCESS;
-cl_int CopyImage2Dto2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int CopyImage2Dto2DAction::Setup(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) )
+    if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight)))
         return error;
     mWidth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mSrcImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mDstImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mDstImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth,
+                                mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     mDepth = 1;
     return CL_SUCCESS;
-cl_int CopyImage2Dto3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int CopyImage2Dto3DAction::Setup(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     mDepth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mSrcImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mDstImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mDstImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     mDepth = 1;
     return CL_SUCCESS;
-cl_int CopyImage3Dto2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int CopyImage3Dto2DAction::Setup(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     mDepth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mSrcImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mDstImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mDstImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth,
+                                mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     mDepth = 1;
     return CL_SUCCESS;
-cl_int CopyImage3Dto3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int CopyImage3Dto3DAction::Setup(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     mDepth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mSrcImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mDstImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mDstImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     return CL_SUCCESS;
 #pragma mark -------------------- Copy Image/Buffer Classes -------------------------
-cl_int Copy2DImageToBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int Copy2DImageToBufferAction::Setup(cl_device_id device, cl_context context,
+                                        cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) )
+    if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight)))
         return error;
     mWidth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mSrcImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mDstBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, mWidth * mHeight * 4, NULL, &error );
-    test_error( error, "Unable to create buffer to test against" );
+    mDstBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                mWidth * mHeight * 4, NULL, &error);
+    test_error(error, "Unable to create buffer to test against");
     return CL_SUCCESS;
-cl_int Copy2DImageToBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int Copy2DImageToBufferAction::Execute(cl_command_queue queue,
+                                          cl_uint numWaits, cl_event *waits,
+                                          cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 };
-    cl_int error = clEnqueueCopyImageToBuffer( queue, mSrcImage, mDstBuffer, origin, region, 0, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image to buffer copy" );
+    cl_int error =
+        clEnqueueCopyImageToBuffer(queue, mSrcImage, mDstBuffer, origin, region,
+                                   0, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image to buffer copy");
     return CL_SUCCESS;
-cl_int Copy3DImageToBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int Copy3DImageToBufferAction::Setup(cl_device_id device, cl_context context,
+                                        cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     mDepth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mSrcImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
-    mDstBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, mWidth * mHeight * mDepth * 4, NULL, &error );
-    test_error( error, "Unable to create buffer to test against" );
+    mDstBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                mWidth * mHeight * mDepth * 4, NULL, &error);
+    test_error(error, "Unable to create buffer to test against");
     return CL_SUCCESS;
-cl_int Copy3DImageToBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int Copy3DImageToBufferAction::Execute(cl_command_queue queue,
+                                          cl_uint numWaits, cl_event *waits,
+                                          cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth };
-    cl_int error = clEnqueueCopyImageToBuffer( queue, mSrcImage, mDstBuffer, origin, region, 0, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue image to buffer copy" );
+    cl_int error =
+        clEnqueueCopyImageToBuffer(queue, mSrcImage, mDstBuffer, origin, region,
+                                   0, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue image to buffer copy");
     return CL_SUCCESS;
-cl_int CopyBufferTo2DImageAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int CopyBufferTo2DImageAction::Setup(cl_device_id device, cl_context context,
+                                        cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) )
+    if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight)))
         return error;
     mWidth /= 2;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mSrcBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, mWidth * mHeight * 4, NULL, &error );
-    test_error( error, "Unable to create buffer to test against" );
+    mSrcBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, mWidth * mHeight * 4,
+                                NULL, &error);
+    test_error(error, "Unable to create buffer to test against");
-    mDstImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mDstImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth,
+                                mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     return CL_SUCCESS;
-cl_int CopyBufferTo2DImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int CopyBufferTo2DImageAction::Execute(cl_command_queue queue,
+                                          cl_uint numWaits, cl_event *waits,
+                                          cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 };
-    cl_int error = clEnqueueCopyBufferToImage( queue, mSrcBuffer, mDstImage, 0, origin, region, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue buffer to image copy" );
+    cl_int error =
+        clEnqueueCopyBufferToImage(queue, mSrcBuffer, mDstImage, 0, origin,
+                                   region, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue buffer to image copy");
     return CL_SUCCESS;
-cl_int CopyBufferTo3DImageAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int CopyBufferTo3DImageAction::Setup(cl_device_id device, cl_context context,
+                                        cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) )
+    if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth)))
         return error;
     mDepth /= 2;
-    mSrcBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, mWidth * mHeight * mDepth * 4, NULL, &error );
-    test_error( error, "Unable to create buffer to test against" );
+    mSrcBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY,
+                                mWidth * mHeight * mDepth * 4, NULL, &error);
+    test_error(error, "Unable to create buffer to test against");
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mDstImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mDstImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth,
+                                mHeight, mDepth, 0, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     return CL_SUCCESS;
-cl_int CopyBufferTo3DImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int CopyBufferTo3DImageAction::Execute(cl_command_queue queue,
+                                          cl_uint numWaits, cl_event *waits,
+                                          cl_event *outEvent)
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth };
-    cl_int error = clEnqueueCopyBufferToImage( queue, mSrcBuffer, mDstImage, 0, origin, region, numWaits, waits, outEvent );
-    test_error( error, "Unable to enqueue buffer to image copy" );
+    cl_int error =
+        clEnqueueCopyBufferToImage(queue, mSrcBuffer, mDstImage, 0, origin,
+                                   region, numWaits, waits, outEvent);
+    test_error(error, "Unable to enqueue buffer to image copy");
     return CL_SUCCESS;
@@ -627,34 +719,39 @@
     if (mQueue)
-        clEnqueueUnmapMemObject( mQueue, mImage, mMappedPtr, 0, NULL, NULL );
+        clEnqueueUnmapMemObject(mQueue, mImage, mMappedPtr, 0, NULL, NULL);
-cl_int MapImageAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue )
+cl_int MapImageAction::Setup(cl_device_id device, cl_context context,
+                             cl_command_queue queue)
     cl_int error;
-    if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) )
+    if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight)))
         return error;
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
-    mImage = create_image_2d( context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, &format, mWidth, mHeight, 0, NULL, &error );
-    test_error( error, "Unable to create image to test against" );
+    mImage = create_image_2d(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                             &format, mWidth, mHeight, 0, NULL, &error);
+    test_error(error, "Unable to create image to test against");
     return CL_SUCCESS;
-cl_int MapImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent )
+cl_int MapImageAction::Execute(cl_command_queue queue, cl_uint numWaits,
+                               cl_event *waits, cl_event *outEvent)
     cl_int error;
-    size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 };
+    size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 };
     size_t outPitch;
     mQueue = queue;
-    mMappedPtr = clEnqueueMapImage( queue, mImage, CL_FALSE, CL_MAP_READ, origin, region, &outPitch, NULL, numWaits, waits, outEvent, &error );
-    test_error( error, "Unable to enqueue image map" );
+    mMappedPtr =
+        clEnqueueMapImage(queue, mImage, CL_FALSE, CL_MAP_READ, origin, region,
+                          &outPitch, NULL, numWaits, waits, outEvent, &error);
+    test_error(error, "Unable to enqueue image map");
     return CL_SUCCESS;
diff --git a/test_conformance/events/action_classes.h b/test_conformance/events/action_classes.h
index 069ed34..e528f11 100644
--- a/test_conformance/events/action_classes.h
+++ b/test_conformance/events/action_classes.h
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -23,303 +23,319 @@
 // it would potentially be possible for an implementation to make actions
 // wait on one another based on their shared I/O, not because of their
 // wait lists!
-class Action
-    public:
-        Action() {}
-        virtual ~Action() {}
+class Action {
+    Action() {}
+    virtual ~Action() {}
-        virtual cl_int        Setup( cl_device_id device, cl_context context, cl_command_queue queue ) = 0;
-        virtual cl_int        Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) = 0;
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue) = 0;
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent) = 0;
-        virtual const char * GetName( void ) const = 0;
+    virtual const char *GetName(void) const = 0;
-    protected:
-        cl_int    IGetPreferredImageSize2D( cl_device_id device, size_t &outWidth, size_t &outHeight );
-        cl_int    IGetPreferredImageSize3D( cl_device_id device, size_t &outWidth, size_t &outHeight, size_t &outDepth );
+    cl_int IGetPreferredImageSize2D(cl_device_id device, size_t &outWidth,
+                                    size_t &outHeight);
+    cl_int IGetPreferredImageSize3D(cl_device_id device, size_t &outWidth,
+                                    size_t &outHeight, size_t &outDepth);
 // Simple NDRangeKernel execution that takes a noticable amount of time
-class NDRangeKernelAction : public Action
-    public:
-        NDRangeKernelAction() {}
-        virtual ~NDRangeKernelAction() {}
+class NDRangeKernelAction : public Action {
+    NDRangeKernelAction() {}
+    virtual ~NDRangeKernelAction() {}
-        size_t                mLocalThreads[ 1 ];
-        clMemWrapper        mStreams[ 2 ];
-        clProgramWrapper    mProgram;
-        clKernelWrapper        mKernel;
+    size_t mLocalThreads[1];
+    clMemWrapper mStreams[2];
+    clProgramWrapper mProgram;
+    clKernelWrapper mKernel;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "NDRangeKernel"; }
+    virtual const char *GetName(void) const { return "NDRangeKernel"; }
 // Base action for buffer actions
-class BufferAction : public Action
-    public:
-        clMemWrapper        mBuffer;
-        size_t                mSize;
-        void                *mOutBuffer;
+class BufferAction : public Action {
+    clMemWrapper mBuffer;
+    size_t mSize;
+    void *mOutBuffer;
-        BufferAction() { mOutBuffer = NULL; }
-        virtual ~BufferAction() { free( mOutBuffer ); }
+    BufferAction() { mOutBuffer = NULL; }
+    virtual ~BufferAction() { free(mOutBuffer); }
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue, bool allocate );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue, bool allocate);
-class ReadBufferAction : public BufferAction
-    public:
-        ReadBufferAction() {}
-        virtual ~ReadBufferAction() {}
+class ReadBufferAction : public BufferAction {
+    ReadBufferAction() {}
+    virtual ~ReadBufferAction() {}
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "ReadBuffer"; }
+    virtual const char *GetName(void) const { return "ReadBuffer"; }
-class WriteBufferAction : public BufferAction
-    public:
-        WriteBufferAction() {}
-        virtual ~WriteBufferAction() {}
+class WriteBufferAction : public BufferAction {
+    WriteBufferAction() {}
+    virtual ~WriteBufferAction() {}
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "WriteBuffer"; }
+    virtual const char *GetName(void) const { return "WriteBuffer"; }
-class MapBufferAction : public BufferAction
-    public:
-        MapBufferAction() : mQueue(0) {}
+class MapBufferAction : public BufferAction {
+    MapBufferAction(): mQueue(0) {}
-        cl_command_queue    mQueue;
-        void                *mMappedPtr;
+    cl_command_queue mQueue;
+    void *mMappedPtr;
-        virtual ~MapBufferAction();
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual ~MapBufferAction();
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "MapBuffer"; }
+    virtual const char *GetName(void) const { return "MapBuffer"; }
-class UnmapBufferAction : public BufferAction
-    public:
-        UnmapBufferAction() {}
-        virtual ~UnmapBufferAction() {}
+class UnmapBufferAction : public BufferAction {
+    UnmapBufferAction() {}
+    virtual ~UnmapBufferAction() {}
-        void                *mMappedPtr;
+    void *mMappedPtr;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "UnmapBuffer"; }
+    virtual const char *GetName(void) const { return "UnmapBuffer"; }
-class ReadImage2DAction : public Action
-    public:
-        ReadImage2DAction() { mOutput = NULL; }
-        virtual ~ReadImage2DAction() { free( mOutput ); }
+class ReadImage2DAction : public Action {
+    ReadImage2DAction() { mOutput = NULL; }
+    virtual ~ReadImage2DAction() { free(mOutput); }
-        clMemWrapper        mImage;
-        size_t                mWidth, mHeight;
-        void                *mOutput;
+    clMemWrapper mImage;
+    size_t mWidth, mHeight;
+    void *mOutput;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "ReadImage2D"; }
+    virtual const char *GetName(void) const { return "ReadImage2D"; }
-class ReadImage3DAction : public Action
-    public:
-        ReadImage3DAction() { mOutput = NULL; }
-        virtual ~ReadImage3DAction() { free( mOutput ); }
+class ReadImage3DAction : public Action {
+    ReadImage3DAction() { mOutput = NULL; }
+    virtual ~ReadImage3DAction() { free(mOutput); }
-        clMemWrapper        mImage;
-        size_t                mWidth, mHeight, mDepth;
-        void                *mOutput;
+    clMemWrapper mImage;
+    size_t mWidth, mHeight, mDepth;
+    void *mOutput;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "ReadImage3D"; }
+    virtual const char *GetName(void) const { return "ReadImage3D"; }
-class WriteImage2DAction : public Action
-    public:
-        clMemWrapper        mImage;
-        size_t                mWidth, mHeight;
-        void                *mOutput;
+class WriteImage2DAction : public Action {
+    clMemWrapper mImage;
+    size_t mWidth, mHeight;
+    void *mOutput;
-        WriteImage2DAction() { mOutput = NULL; }
-        virtual ~WriteImage2DAction() { free( mOutput ); }
+    WriteImage2DAction() { mOutput = NULL; }
+    virtual ~WriteImage2DAction() { free(mOutput); }
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "WriteImage2D"; }
+    virtual const char *GetName(void) const { return "WriteImage2D"; }
-class WriteImage3DAction : public Action
-    public:
-        clMemWrapper        mImage;
-        size_t                mWidth, mHeight, mDepth;
-        void                *mOutput;
+class WriteImage3DAction : public Action {
+    clMemWrapper mImage;
+    size_t mWidth, mHeight, mDepth;
+    void *mOutput;
-        WriteImage3DAction() { mOutput = NULL; }
-        virtual ~WriteImage3DAction() { free( mOutput ); }
+    WriteImage3DAction() { mOutput = NULL; }
+    virtual ~WriteImage3DAction() { free(mOutput); }
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "WriteImage3D"; }
+    virtual const char *GetName(void) const { return "WriteImage3D"; }
-class CopyImageAction : public Action
-    public:
-        CopyImageAction() {}
-        virtual ~CopyImageAction() {}
+class CopyImageAction : public Action {
+    CopyImageAction() {}
+    virtual ~CopyImageAction() {}
-        clMemWrapper        mSrcImage, mDstImage;
-        size_t                mWidth, mHeight, mDepth;
+    clMemWrapper mSrcImage, mDstImage;
+    size_t mWidth, mHeight, mDepth;
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-class CopyImage2Dto2DAction : public CopyImageAction
-    public:
-        CopyImage2Dto2DAction() {}
-        virtual ~CopyImage2Dto2DAction() {}
+class CopyImage2Dto2DAction : public CopyImageAction {
+    CopyImage2Dto2DAction() {}
+    virtual ~CopyImage2Dto2DAction() {}
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
-        virtual const char * GetName( void ) const { return "CopyImage2Dto2D"; }
+    virtual const char *GetName(void) const { return "CopyImage2Dto2D"; }
-class CopyImage2Dto3DAction : public CopyImageAction
-    public:
-        CopyImage2Dto3DAction() {}
-        virtual ~CopyImage2Dto3DAction() {}
+class CopyImage2Dto3DAction : public CopyImageAction {
+    CopyImage2Dto3DAction() {}
+    virtual ~CopyImage2Dto3DAction() {}
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
-        virtual const char * GetName( void ) const { return "CopyImage2Dto3D"; }
+    virtual const char *GetName(void) const { return "CopyImage2Dto3D"; }
-class CopyImage3Dto2DAction : public CopyImageAction
-    public:
-        CopyImage3Dto2DAction() {}
-        virtual ~CopyImage3Dto2DAction() {}
+class CopyImage3Dto2DAction : public CopyImageAction {
+    CopyImage3Dto2DAction() {}
+    virtual ~CopyImage3Dto2DAction() {}
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
-        virtual const char * GetName( void ) const { return "CopyImage3Dto2D"; }
+    virtual const char *GetName(void) const { return "CopyImage3Dto2D"; }
-class CopyImage3Dto3DAction : public CopyImageAction
-    public:
-        CopyImage3Dto3DAction() {}
-        virtual ~CopyImage3Dto3DAction() {}
+class CopyImage3Dto3DAction : public CopyImageAction {
+    CopyImage3Dto3DAction() {}
+    virtual ~CopyImage3Dto3DAction() {}
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
-        virtual const char * GetName( void ) const { return "CopyImage3Dto3D"; }
+    virtual const char *GetName(void) const { return "CopyImage3Dto3D"; }
-class Copy2DImageToBufferAction : public Action
-    public:
-        Copy2DImageToBufferAction() {}
-        virtual ~Copy2DImageToBufferAction() {}
+class Copy2DImageToBufferAction : public Action {
+    Copy2DImageToBufferAction() {}
+    virtual ~Copy2DImageToBufferAction() {}
-        clMemWrapper        mSrcImage, mDstBuffer;
-        size_t                mWidth, mHeight;
+    clMemWrapper mSrcImage, mDstBuffer;
+    size_t mWidth, mHeight;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "Copy2DImageToBuffer"; }
+    virtual const char *GetName(void) const { return "Copy2DImageToBuffer"; }
-class Copy3DImageToBufferAction : public Action
-    public:
-        Copy3DImageToBufferAction() {}
-        virtual ~Copy3DImageToBufferAction() {}
+class Copy3DImageToBufferAction : public Action {
+    Copy3DImageToBufferAction() {}
+    virtual ~Copy3DImageToBufferAction() {}
-        clMemWrapper        mSrcImage, mDstBuffer;
-        size_t                mWidth, mHeight, mDepth;
+    clMemWrapper mSrcImage, mDstBuffer;
+    size_t mWidth, mHeight, mDepth;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "Copy3DImageToBuffer"; }
+    virtual const char *GetName(void) const { return "Copy3DImageToBuffer"; }
-class CopyBufferTo2DImageAction : public Action
-    public:
-        CopyBufferTo2DImageAction() {}
-        virtual ~CopyBufferTo2DImageAction() {}
+class CopyBufferTo2DImageAction : public Action {
+    CopyBufferTo2DImageAction() {}
+    virtual ~CopyBufferTo2DImageAction() {}
-        clMemWrapper        mSrcBuffer, mDstImage;
-        size_t                mWidth, mHeight;
+    clMemWrapper mSrcBuffer, mDstImage;
+    size_t mWidth, mHeight;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "CopyBufferTo2D"; }
+    virtual const char *GetName(void) const { return "CopyBufferTo2D"; }
-class CopyBufferTo3DImageAction : public Action
-    public:
-        CopyBufferTo3DImageAction() {}
-        virtual ~CopyBufferTo3DImageAction() {}
+class CopyBufferTo3DImageAction : public Action {
+    CopyBufferTo3DImageAction() {}
+    virtual ~CopyBufferTo3DImageAction() {}
-        clMemWrapper        mSrcBuffer, mDstImage;
-        size_t                mWidth, mHeight, mDepth;
+    clMemWrapper mSrcBuffer, mDstImage;
+    size_t mWidth, mHeight, mDepth;
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "CopyBufferTo3D"; }
+    virtual const char *GetName(void) const { return "CopyBufferTo3D"; }
-class MapImageAction : public Action
-    public:
-        MapImageAction() : mQueue(0) {}
+class MapImageAction : public Action {
+    MapImageAction(): mQueue(0) {}
-        clMemWrapper        mImage;
-        size_t                mWidth, mHeight;
-        void                *mMappedPtr;
-        cl_command_queue    mQueue;
+    clMemWrapper mImage;
+    size_t mWidth, mHeight;
+    void *mMappedPtr;
+    cl_command_queue mQueue;
-        virtual ~MapImageAction();
-        virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue );
-        virtual cl_int    Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent );
+    virtual ~MapImageAction();
+    virtual cl_int Setup(cl_device_id device, cl_context context,
+                         cl_command_queue queue);
+    virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits,
+                           cl_event *waits, cl_event *outEvent);
-        virtual const char * GetName( void ) const { return "MapImage"; }
+    virtual const char *GetName(void) const { return "MapImage"; }
diff --git a/test_conformance/events/main.cpp b/test_conformance/events/main.cpp
index 777d2d3..74682f9 100644
--- a/test_conformance/events/main.cpp
+++ b/test_conformance/events/main.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -24,44 +24,44 @@
 test_definition test_list[] = {
-    ADD_TEST( event_get_execute_status ),
-    ADD_TEST( event_get_write_array_status ),
-    ADD_TEST( event_get_read_array_status ),
-    ADD_TEST( event_get_info ),
-    ADD_TEST( event_wait_for_execute ),
-    ADD_TEST( event_wait_for_array ),
-    ADD_TEST( event_flush ),
-    ADD_TEST( event_finish_execute ),
-    ADD_TEST( event_finish_array ),
-    ADD_TEST( event_release_before_done ),
-    ADD_TEST( event_enqueue_marker ),
+    ADD_TEST(event_get_execute_status),
+    ADD_TEST(event_get_write_array_status),
+    ADD_TEST(event_get_read_array_status),
+    ADD_TEST(event_get_info),
+    ADD_TEST(event_wait_for_execute),
+    ADD_TEST(event_wait_for_array),
+    ADD_TEST(event_flush),
+    ADD_TEST(event_finish_execute),
+    ADD_TEST(event_finish_array),
+    ADD_TEST(event_release_before_done),
+    ADD_TEST(event_enqueue_marker),
 #ifdef CL_VERSION_1_2
-    ADD_TEST( event_enqueue_marker_with_event_list ),
-    ADD_TEST( event_enqueue_barrier_with_event_list ),
+    ADD_TEST(event_enqueue_marker_with_event_list),
+    ADD_TEST(event_enqueue_barrier_with_event_list),
-    ADD_TEST( out_of_order_event_waitlist_single_queue ),
-    ADD_TEST( out_of_order_event_waitlist_multi_queue ),
-    ADD_TEST( out_of_order_event_waitlist_multi_queue_multi_device ),
-    ADD_TEST( out_of_order_event_enqueue_wait_for_events_single_queue ),
-    ADD_TEST( out_of_order_event_enqueue_wait_for_events_multi_queue ),
-    ADD_TEST( out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device ),
-    ADD_TEST( out_of_order_event_enqueue_marker_single_queue ),
-    ADD_TEST( out_of_order_event_enqueue_marker_multi_queue ),
-    ADD_TEST( out_of_order_event_enqueue_marker_multi_queue_multi_device ),
-    ADD_TEST( out_of_order_event_enqueue_barrier_single_queue ),
+    ADD_TEST(out_of_order_event_waitlist_single_queue),
+    ADD_TEST(out_of_order_event_waitlist_multi_queue),
+    ADD_TEST(out_of_order_event_waitlist_multi_queue_multi_device),
+    ADD_TEST(out_of_order_event_enqueue_wait_for_events_single_queue),
+    ADD_TEST(out_of_order_event_enqueue_wait_for_events_multi_queue),
+    ADD_TEST(
+        out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device),
+    ADD_TEST(out_of_order_event_enqueue_marker_single_queue),
+    ADD_TEST(out_of_order_event_enqueue_marker_multi_queue),
+    ADD_TEST(out_of_order_event_enqueue_marker_multi_queue_multi_device),
+    ADD_TEST(out_of_order_event_enqueue_barrier_single_queue),
-    ADD_TEST( waitlists ),
-    ADD_TEST( userevents ),
-    ADD_TEST( callbacks ),
-    ADD_TEST( callbacks_simultaneous ),
-    ADD_TEST( userevents_multithreaded ),
+    ADD_TEST(waitlists),
+    ADD_TEST(userevents),
+    ADD_TEST(callbacks),
+    ADD_TEST(callbacks_simultaneous),
+    ADD_TEST(userevents_multithreaded),
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 int main(int argc, const char *argv[])
     return runTestHarness(argc, argv, test_num, test_list, false, 0);
diff --git a/test_conformance/events/procs.h b/test_conformance/events/procs.h
index f077c24..97309db 100644
--- a/test_conformance/events/procs.h
+++ b/test_conformance/events/procs.h
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,44 +18,101 @@
 #include "harness/typeWrappers.h"
 #include "harness/clImageHelper.h"
-extern float    random_float(float low, float high);
-extern float    calculate_ulperror(float a, float b);
+extern float random_float(float low, float high);
+extern float calculate_ulperror(float a, float b);
-extern int        test_event_get_execute_status(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_get_write_array_status(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_get_read_array_status(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_get_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_wait_for_execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_wait_for_array(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_flush(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_finish_execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_finish_array(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_release_before_done(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_enqueue_marker(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_event_get_execute_status(cl_device_id deviceID,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         int num_elements);
+extern int test_event_get_write_array_status(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_event_get_read_array_status(cl_device_id deviceID,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements);
+extern int test_event_get_info(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_event_wait_for_execute(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_event_wait_for_array(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements);
+extern int test_event_flush(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements);
+extern int test_event_finish_execute(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements);
+extern int test_event_finish_array(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements);
+extern int test_event_release_before_done(cl_device_id deviceID,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements);
+extern int test_event_enqueue_marker(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements);
 #ifdef CL_VERSION_1_2
-extern int        test_event_enqueue_marker_with_event_list(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_event_enqueue_barrier_with_event_list(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_event_enqueue_marker_with_event_list(cl_device_id deviceID,
+                                                     cl_context context,
+                                                     cl_command_queue queue,
+                                                     int num_elements);
+extern int test_event_enqueue_barrier_with_event_list(cl_device_id deviceID,
+                                                      cl_context context,
+                                                      cl_command_queue queue,
+                                                      int num_elements);
-extern int        test_out_of_order_event_waitlist_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_out_of_order_event_waitlist_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_out_of_order_event_waitlist_multi_queue_multi_device(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_out_of_order_event_waitlist_single_queue(cl_device_id deviceID,
+                                                         cl_context context,
+                                                         cl_command_queue queue,
+                                                         int num_elements);
+extern int test_out_of_order_event_waitlist_multi_queue(cl_device_id deviceID,
+                                                        cl_context context,
+                                                        cl_command_queue queue,
+                                                        int num_elements);
+extern int test_out_of_order_event_waitlist_multi_queue_multi_device(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
-extern int        test_out_of_order_event_enqueue_wait_for_events_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_out_of_order_event_enqueue_wait_for_events_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_out_of_order_event_enqueue_wait_for_events_single_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_out_of_order_event_enqueue_wait_for_events_multi_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int
+test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
-extern int        test_out_of_order_event_enqueue_barrier_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_out_of_order_event_enqueue_barrier_single_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
-extern int        test_out_of_order_event_enqueue_marker_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_out_of_order_event_enqueue_marker_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int        test_out_of_order_event_enqueue_marker_multi_queue_multi_device(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_out_of_order_event_enqueue_marker_single_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_out_of_order_event_enqueue_marker_multi_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_out_of_order_event_enqueue_marker_multi_queue_multi_device(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
-extern int        test_waitlists( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
-extern int        test_userevents( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
-extern int        test_callbacks( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
-extern int        test_callbacks_simultaneous( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
-extern int        test_userevents_multithreaded( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
+extern int test_waitlists(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements);
+extern int test_userevents(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements);
+extern int test_callbacks(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements);
+extern int test_callbacks_simultaneous(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_userevents_multithreaded(cl_device_id deviceID,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         int num_elements);
diff --git a/test_conformance/events/testBase.h b/test_conformance/events/testBase.h
index 5b49bfd..63086d7 100644
--- a/test_conformance/events/testBase.h
+++ b/test_conformance/events/testBase.h
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -26,6 +26,3 @@
 #include "procs.h"
 #endif // _testBase_h
diff --git a/test_conformance/events/test_callbacks.cpp b/test_conformance/events/test_callbacks.cpp
index 2ffb9ca..04481de 100644
--- a/test_conformance/events/test_callbacks.cpp
+++ b/test_conformance/events/test_callbacks.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,28 +18,34 @@
 #include "harness/conversions.h"
 #include "harness/ThreadPool.h"
-#if !defined (_MSC_VER)
+#if !defined(_MSC_VER)
 #include <unistd.h>
 #endif // !_MSC_VER
-extern const char *IGetStatusString( cl_int status );
+extern const char *IGetStatusString(cl_int status);
 #define PRINT_OPS 0
-// Yes, this is somewhat nasty, in that we're relying on the CPU (the real CPU, not the OpenCL device)
-// to be atomic w.r.t. boolean values. Although if it isn't, we'll just miss the check on this bool
-// until the next time around, so it's not that big of a deal. Ideally, we'd be using a semaphore with
-// a trywait on it, but then that introduces the fun issue of what to do on Win32, etc. This way is
-// far more portable, and worst case of failure is a slightly longer test run.
+// Yes, this is somewhat nasty, in that we're relying on the CPU (the real CPU,
+// not the OpenCL device) to be atomic w.r.t. boolean values. Although if it
+// isn't, we'll just miss the check on this bool until the next time around, so
+// it's not that big of a deal. Ideally, we'd be using a semaphore with a
+// trywait on it, but then that introduces the fun issue of what to do on Win32,
+// etc. This way is far more portable, and worst case of failure is a slightly
+// longer test run.
 static bool sCallbackTriggered = false;
-static bool sCallbackTriggered_flag[ EVENT_CALLBACK_TYPE_TOTAL ] ={ false,false, false };
+static bool sCallbackTriggered_flag[EVENT_CALLBACK_TYPE_TOTAL] = { false, false,
+                                                                   false };
+cl_int event_callback_types[EVENT_CALLBACK_TYPE_TOTAL] = { CL_SUBMITTED,
+                                                           CL_RUNNING,
+                                                           CL_COMPLETE };
 // Our callback function
-/*void CL_CALLBACK single_event_callback_function( cl_event event, cl_int commandStatus, void * userData )
+/*void CL_CALLBACK single_event_callback_function( cl_event event, cl_int
+commandStatus, void * userData )
      int i=*static_cast<int *>(userData);
     log_info( "\tEvent callback  %d   triggered\n",  i);
@@ -47,295 +53,322 @@
 /*   use struct as call back para */
-typedef struct { cl_int enevt_type; int index; } CALL_BACK_USER_DATA;
-void CL_CALLBACK single_event_callback_function_flags( cl_event event, cl_int commandStatus, void * userData )
+typedef struct
-   // int i=*static_cast<int *>(userData);
-    CALL_BACK_USER_DATA *pdata= static_cast<CALL_BACK_USER_DATA *>(userData);
+    cl_int event_type;
+    int index;
-    log_info( "\tEvent callback  %d  of type %d triggered\n",  pdata->index, pdata->enevt_type);
-    sCallbackTriggered_flag [pdata->index ] = true;
+void CL_CALLBACK single_event_callback_function_flags(cl_event event,
+                                                      cl_int commandStatus,
+                                                      void *userData)
+    // int i=*static_cast<int *>(userData);
+    CALL_BACK_USER_DATA *pdata = static_cast<CALL_BACK_USER_DATA *>(userData);
+    log_info("\tEvent callback  %d  of type %d triggered\n", pdata->index,
+             pdata->event_type);
+    sCallbackTriggered_flag[pdata->index] = true;
-int test_callback_event_single( cl_device_id device, cl_context context, cl_command_queue queue, Action *actionToTest )
+int test_callback_event_single(cl_device_id device, cl_context context,
+                               cl_command_queue queue, Action *actionToTest)
-    // Note: we don't use the waiting feature here. We just want to verify that we get a callback called
-    // when the given event finishes
+    // Note: we don't use the waiting feature here. We just want to verify that
+    // we get a callback called when the given event finishes
-    cl_int error = actionToTest->Setup( device, context, queue );
-    test_error( error, "Unable to set up test action" );
+    cl_int error = actionToTest->Setup(device, context, queue);
+    test_error(error, "Unable to set up test action");
     // Set up a user event, which we use as a gate for the second event
-    clEventWrapper gateEvent = clCreateUserEvent( context, &error );
-    test_error( error, "Unable to set up user gate event" );
+    clEventWrapper gateEvent = clCreateUserEvent(context, &error);
+    test_error(error, "Unable to set up user gate event");
     // Set up the execution of the action with its actual event
     clEventWrapper actualEvent;
-    error = actionToTest->Execute( queue, 1, &gateEvent, &actualEvent );
-    test_error( error, "Unable to set up action execution" );
+    error = actionToTest->Execute(queue, 1, &gateEvent, &actualEvent);
+    test_error(error, "Unable to set up action execution");
     // Set up the callback on the actual event
-  /*  use struct as call back para */
-  int index [EVENT_CALLBACK_TYPE_TOTAL]={ 0,1,2};
-  for( int i=0;i< EVENT_CALLBACK_TYPE_TOTAL; i++)
-  {
-       user_data[i].enevt_type=event_callback_types[i];
-       user_data[i].index =i;
-       error = clSetEventCallback( actualEvent, event_callback_types[i], single_event_callback_function_flags, user_data+i );
-  }
+    /*  use struct as call back para */
+    for (int i = 0; i < EVENT_CALLBACK_TYPE_TOTAL; i++)
+    {
+        user_data[i].event_type = event_callback_types[i];
+        user_data[i].index = i;
+        error = clSetEventCallback(actualEvent, event_callback_types[i],
+                                   single_event_callback_function_flags,
+                                   user_data + i);
+    }
     // Now release the user event, which will allow our actual action to run
-    error = clSetUserEventStatus( gateEvent, CL_COMPLETE );
-    test_error( error, "Unable to trigger gate event" );
+    error = clSetUserEventStatus(gateEvent, CL_COMPLETE);
+    test_error(error, "Unable to trigger gate event");
-    // Now we wait for completion. Note that we can actually wait on the event itself, at least at first
-    error = clWaitForEvents( 1, &actualEvent );
-    test_error( error, "Unable to wait for actual test event" );
+    // Now we wait for completion. Note that we can actually wait on the event
+    // itself, at least at first
+    error = clWaitForEvents(1, &actualEvent);
+    test_error(error, "Unable to wait for actual test event");
-    // Note: we can check our callback now, and it MIGHT have been triggered, but that's not guaranteed
-    if( sCallbackTriggered )
+    // Note: we can check our callback now, and it MIGHT have been triggered,
+    // but that's not guaranteed
+    if (sCallbackTriggered)
         // We're all good, so return success
         return 0;
-    // The callback has not yet been called, but that doesn't mean it won't be. So wait for it
-    log_info( "\tWaiting for callback..." );
-    fflush( stdout );
-    for( int i = 0; i < 10 * 10; i++ )
+    // The callback has not yet been called, but that doesn't mean it won't be.
+    // So wait for it
+    log_info("\tWaiting for callback...");
+    fflush(stdout);
+    for (int i = 0; i < 10 * 10; i++)
-        usleep( 100000 );    // 1/10th second
+        usleep(100000); // 1/10th second
-    int cc=0;
-    for( int k=0;k< EVENT_CALLBACK_TYPE_TOTAL;k++)
-        if (sCallbackTriggered_flag[k]) {
-            cc++;
-        }
+        int cc = 0;
+        for (int k = 0; k < EVENT_CALLBACK_TYPE_TOTAL; k++)
+            if (sCallbackTriggered_flag[k])
+            {
+                cc++;
+            }
-        if  (cc== EVENT_CALLBACK_TYPE_TOTAL  )
+        if (cc == EVENT_CALLBACK_TYPE_TOTAL)
-            log_info( "\n" );
+            log_info("\n");
             return 0;
-        log_info( "." );
-        fflush( stdout );
+        log_info(".");
+        fflush(stdout);
     // If we got here, we never got the callback
-    log_error( "\nCallback not called within 10 seconds! (assuming failure)\n" );
+    log_error("\nCallback not called within 10 seconds! (assuming failure)\n");
     return -1;
-#define TEST_ACTION( name ) \
-{    \
-    name##Action action;    \
-    log_info( "-- Testing " #name "...\n" );    \
-    if( ( error = test_callback_event_single( deviceID, context, queue, &action ) ) != CL_SUCCESS )    \
-        retVal++;            \
-    clFinish( queue ); \
+#define TEST_ACTION(name)                                                      \
+    {                                                                          \
+        name##Action action;                                                   \
+        log_info("-- Testing " #name "...\n");                                 \
+        if ((error = test_callback_event_single(deviceID, context, queue,      \
+                                                &action))                      \
+            != CL_SUCCESS)                                                     \
+            retVal++;                                                          \
+        clFinish(queue);                                                       \
+    }
-int test_callbacks( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_callbacks(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
     cl_int error;
     int retVal = 0;
-    log_info( "\n" );
+    log_info("\n");
-    TEST_ACTION( NDRangeKernel )
+    TEST_ACTION(NDRangeKernel)
-    TEST_ACTION( ReadBuffer )
-    TEST_ACTION( WriteBuffer )
-    TEST_ACTION( MapBuffer )
-    TEST_ACTION( UnmapBuffer )
+    TEST_ACTION(ReadBuffer)
+    TEST_ACTION(WriteBuffer)
+    TEST_ACTION(MapBuffer)
+    TEST_ACTION(UnmapBuffer)
-    if( checkForImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED )
+    if (checkForImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
-        log_info( "\nNote: device does not support images. Skipping remainder of callback tests...\n" );
+        log_info("\nNote: device does not support images. Skipping remainder "
+                 "of callback tests...\n");
-        TEST_ACTION( ReadImage2D )
-        TEST_ACTION( WriteImage2D )
-        TEST_ACTION( CopyImage2Dto2D )
-        TEST_ACTION( Copy2DImageToBuffer )
-        TEST_ACTION( CopyBufferTo2DImage )
-        TEST_ACTION( MapImage )
+        TEST_ACTION(ReadImage2D)
+        TEST_ACTION(WriteImage2D)
+        TEST_ACTION(CopyImage2Dto2D)
+        TEST_ACTION(Copy2DImageToBuffer)
+        TEST_ACTION(CopyBufferTo2DImage)
+        TEST_ACTION(MapImage)
-        if( checkFor3DImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED )
-            log_info( "\nNote: device does not support 3D images. Skipping remainder of waitlist tests...\n" );
+        if (checkFor3DImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
+            log_info("\nNote: device does not support 3D images. Skipping "
+                     "remainder of waitlist tests...\n");
-            TEST_ACTION( ReadImage3D )
-            TEST_ACTION( WriteImage3D )
-            TEST_ACTION( CopyImage2Dto3D )
-            TEST_ACTION( CopyImage3Dto2D )
-            TEST_ACTION( CopyImage3Dto3D )
-            TEST_ACTION( Copy3DImageToBuffer )
-            TEST_ACTION( CopyBufferTo3DImage )
+            TEST_ACTION(ReadImage3D)
+            TEST_ACTION(WriteImage3D)
+            TEST_ACTION(CopyImage2Dto3D)
+            TEST_ACTION(CopyImage3Dto2D)
+            TEST_ACTION(CopyImage3Dto3D)
+            TEST_ACTION(Copy3DImageToBuffer)
+            TEST_ACTION(CopyBufferTo3DImage)
     return retVal;
-static bool sSimultaneousFlags[ 54 ];// for 18 actions with 3 callback status
+static bool sSimultaneousFlags[54]; // for 18 actions with 3 callback status
 static volatile int sSimultaneousCount;
-Action * actions[ 19 ] = { 0 };
+Action *actions[19] = { 0 };
 // Callback for the simultaneous tests
-void CL_CALLBACK simultaneous_event_callback_function( cl_event event, cl_int commandStatus, void * userData )
+void CL_CALLBACK simultaneous_event_callback_function(cl_event event,
+                                                      cl_int commandStatus,
+                                                      void *userData)
     int eventIndex = (int)(size_t)userData;
-  int actionIndex = eventIndex/EVENT_CALLBACK_TYPE_TOTAL;
-  int statusIndex = eventIndex%EVENT_CALLBACK_TYPE_TOTAL;
-    log_info( "\tEvent callback triggered for action %s callback type %s \n", actions[actionIndex]->GetName(), IGetStatusString(statusIndex) );
-    sSimultaneousFlags[ actionIndex ] = true;
-    ThreadPool_AtomicAdd(&sSimultaneousCount,1);
+    int actionIndex = eventIndex / EVENT_CALLBACK_TYPE_TOTAL;
+    int statusIndex = eventIndex % EVENT_CALLBACK_TYPE_TOTAL;
+    log_info("\tEvent callback triggered for action %s callback type %s \n",
+             actions[actionIndex]->GetName(), IGetStatusString(statusIndex));
+    sSimultaneousFlags[actionIndex] = true;
+    ThreadPool_AtomicAdd(&sSimultaneousCount, 1);
-int test_callbacks_simultaneous( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_callbacks_simultaneous(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     cl_int error;
-    // Unlike the singles test, in this one, we run a bunch of events all at once, to verify that
-    // the callbacks do get called once-and-only-once for each event, even if the run out of order or
-    // are dependent on each other
+    // Unlike the singles test, in this one, we run a bunch of events all at
+    // once, to verify that the callbacks do get called once-and-only-once for
+    // each event, even if they run out of order or are dependent on each other
     // First, the list of actions to run
     int actionCount = 0, index = 0;
-    actions[ index++ ] = new NDRangeKernelAction();
-    actions[ index++ ] = new ReadBufferAction();
-    actions[ index++ ] = new WriteBufferAction();
-    actions[ index++ ] = new MapBufferAction();
-    actions[ index++ ] = new UnmapBufferAction();
+    actions[index++] = new NDRangeKernelAction();
+    actions[index++] = new ReadBufferAction();
+    actions[index++] = new WriteBufferAction();
+    actions[index++] = new MapBufferAction();
+    actions[index++] = new UnmapBufferAction();
-    if( checkForImageSupport( deviceID ) != CL_IMAGE_FORMAT_NOT_SUPPORTED )
+    if (checkForImageSupport(deviceID) != CL_IMAGE_FORMAT_NOT_SUPPORTED)
-        actions[ index++ ] = new ReadImage2DAction();
-        actions[ index++ ] = new WriteImage2DAction();
-        actions[ index++ ] = new CopyImage2Dto2DAction();
-        actions[ index++ ] = new Copy2DImageToBufferAction();
-        actions[ index++ ] = new CopyBufferTo2DImageAction();
-        actions[ index++ ] = new MapImageAction();
+        actions[index++] = new ReadImage2DAction();
+        actions[index++] = new WriteImage2DAction();
+        actions[index++] = new CopyImage2Dto2DAction();
+        actions[index++] = new Copy2DImageToBufferAction();
+        actions[index++] = new CopyBufferTo2DImageAction();
+        actions[index++] = new MapImageAction();
-        if( checkFor3DImageSupport( deviceID ) != CL_IMAGE_FORMAT_NOT_SUPPORTED )
+        if (checkFor3DImageSupport(deviceID) != CL_IMAGE_FORMAT_NOT_SUPPORTED)
-            actions[ index++ ] = new ReadImage3DAction();
-            actions[ index++ ] = new WriteImage3DAction();
-            actions[ index++ ] = new CopyImage2Dto3DAction();
-            actions[ index++ ] = new CopyImage3Dto2DAction();
-            actions[ index++ ] = new CopyImage3Dto3DAction();
-            actions[ index++ ] = new Copy3DImageToBufferAction();
-            actions[ index++ ] = new CopyBufferTo3DImageAction();
+            actions[index++] = new ReadImage3DAction();
+            actions[index++] = new WriteImage3DAction();
+            actions[index++] = new CopyImage2Dto3DAction();
+            actions[index++] = new CopyImage3Dto2DAction();
+            actions[index++] = new CopyImage3Dto3DAction();
+            actions[index++] = new Copy3DImageToBufferAction();
+            actions[index++] = new CopyBufferTo3DImageAction();
     actionCount = index;
-    actions[ index++ ] = NULL;
+    actions[index++] = NULL;
     // Now set them all up
-    log_info( "\tSetting up test events...\n" );
-    for( index = 0; actions[ index ] != NULL; index++ )
+    log_info("\tSetting up test events...\n");
+    for (index = 0; actions[index] != NULL; index++)
-        error = actions[ index ]->Setup( deviceID, context, queue );
-        test_error( error, "Unable to set up test action" );
-        sSimultaneousFlags[ index ] = false;
+        error = actions[index]->Setup(deviceID, context, queue);
+        test_error(error, "Unable to set up test action");
+        sSimultaneousFlags[index] = false;
     sSimultaneousCount = 0;
     // Set up the user event to start them all
-    clEventWrapper gateEvent = clCreateUserEvent( context, &error );
-    test_error( error, "Unable to set up user gate event" );
+    clEventWrapper gateEvent = clCreateUserEvent(context, &error);
+    test_error(error, "Unable to set up user gate event");
     // Start executing, all tied to the gate event
-    //clEventWrapper actionEvents[ 18 ];// current actionCount is 18
-    clEventWrapper *actionEvents= new clEventWrapper[actionCount];
+    // clEventWrapper actionEvents[ 18 ];// current actionCount is 18
+    clEventWrapper *actionEvents = new clEventWrapper[actionCount];
     if (actionEvents == NULL)
         log_error(" memory error in test_callbacks_simultaneous  \n");
-      for (size_t i=0;i<(sizeof(actions)/sizeof(actions[0]));++i)
-        if (actions[i]) delete actions[i];
-        return  -1;
+        for (size_t i = 0; i < (sizeof(actions) / sizeof(actions[0])); ++i)
+            if (actions[i]) delete actions[i];
+        return -1;
-    RandomSeed seed( gRandomSeed );
-    for( index = 0; actions[ index ] != NULL; index++ )
+    RandomSeed seed(gRandomSeed);
+    for (index = 0; actions[index] != NULL; index++)
         // Randomly choose to wait on the gate, or wait on the previous event
-        cl_event * eventPtr = &gateEvent;
-        if( ( index > 0 ) && ( random_in_range( 0, 255, seed ) & 1 ) )
-            eventPtr = &actionEvents[ index - 1 ];
+        cl_event *eventPtr = &gateEvent;
+        if ((index > 0) && (random_in_range(0, 255, seed) & 1))
+            eventPtr = &actionEvents[index - 1];
-        error = actions[ index ]->Execute( queue, 1, eventPtr, &actionEvents[ index ] );
-        test_error( error, "Unable to execute test action" );
+        error =
+            actions[index]->Execute(queue, 1, eventPtr, &actionEvents[index]);
+        test_error(error, "Unable to execute test action");
-    for( int k=0; k< EVENT_CALLBACK_TYPE_TOTAL; k++)
-    {
-       error = clSetEventCallback( actionEvents[index], event_callback_types[k], simultaneous_event_callback_function, (void *)(size_t)(index*EVENT_CALLBACK_TYPE_TOTAL+k ) );
-       test_error( error, "Unable to set event callback function" );
-    }
+        for (int k = 0; k < EVENT_CALLBACK_TYPE_TOTAL; k++)
+        {
+            error = clSetEventCallback(
+                actionEvents[index], event_callback_types[k],
+                simultaneous_event_callback_function,
+                (void *)(size_t)(index * EVENT_CALLBACK_TYPE_TOTAL + k));
+            test_error(error, "Unable to set event callback function");
+        }
-  int total_callbacks= actionCount * EVENT_CALLBACK_TYPE_TOTAL;
+    int total_callbacks = actionCount * EVENT_CALLBACK_TYPE_TOTAL;
     // Now release the user event, which will allow our actual action to run
-    error = clSetUserEventStatus( gateEvent, CL_COMPLETE );
-    test_error( error, "Unable to trigger gate event" );
+    error = clSetUserEventStatus(gateEvent, CL_COMPLETE);
+    test_error(error, "Unable to trigger gate event");
     // Wait on the actual action events now
-    log_info( "\tWaiting for test completions...\n" );
-    error = clWaitForEvents( actionCount, &actionEvents[ 0 ] );
-    test_error( error, "Unable to wait for actual test events" );
+    log_info("\tWaiting for test completions...\n");
+    error = clWaitForEvents(actionCount, &actionEvents[0]);
+    test_error(error, "Unable to wait for actual test events");
-    // Note: we can check our callback now, and it MIGHT have been triggered, but that's not guaranteed
-  int last_count = 0;
-    if( ((last_count = sSimultaneousCount)) == total_callbacks)
+    // Note: we can check our callback now, and it MIGHT have been triggered,
+    // but that's not guaranteed
+    int last_count = 0;
+    if (((last_count = sSimultaneousCount)) == total_callbacks)
         // We're all good, so return success
-        log_info( "\t%d of %d callbacks received\n", sSimultaneousCount, total_callbacks );
+        log_info("\t%d of %d callbacks received\n", sSimultaneousCount,
+                 total_callbacks);
-        if (actionEvents) delete [] actionEvents;
-    for (size_t i=0;i<(sizeof(actions)/sizeof(actions[0]));++i)
-      if (actions[i]) delete actions[i];
+        if (actionEvents) delete[] actionEvents;
+        for (size_t i = 0; i < (sizeof(actions) / sizeof(actions[0])); ++i)
+            if (actions[i]) delete actions[i];
         return 0;
     // We haven't gotten (all) of the callbacks, so wait for them
-    log_info( "\tWe've only received %d of the %d callbacks we expected; waiting for more...\n", last_count, total_callbacks );
+    log_info("\tWe've only received %d of the %d callbacks we expected; "
+             "waiting for more...\n",
+             last_count, total_callbacks);
-    for( int i = 0; i < 10 * 10; i++ )
+    for (int i = 0; i < 10 * 10; i++)
-        usleep( 100000 );    // 1/10th second
-        if( ((last_count = sSimultaneousCount)) == total_callbacks )
+        usleep(100000); // 1/10th second
+        if (((last_count = sSimultaneousCount)) == total_callbacks)
-      // All of the callbacks were executed
-      if (actionEvents) delete [] actionEvents;
-      for (size_t i=0;i<(sizeof(actions)/sizeof(actions[0]));++i)
-        if (actions[i]) delete actions[i];
-        return 0;
+            // All of the callbacks were executed
+            if (actionEvents) delete[] actionEvents;
+            for (size_t i = 0; i < (sizeof(actions) / sizeof(actions[0])); ++i)
+                if (actions[i]) delete actions[i];
+            return 0;
     // If we got here, some of the callbacks did not occur in time
-    log_error( "\nError: We only ever received %d of our %d callbacks!\n", last_count, total_callbacks );
-    log_error( "Events that did not receive callbacks:\n" );
-    for( index = 0; actions[ index ] != NULL; index++ )
+    log_error("\nError: We only ever received %d of our %d callbacks!\n",
+              last_count, total_callbacks);
+    log_error("Events that did not receive callbacks:\n");
+    for (index = 0; actions[index] != NULL; index++)
-        if( !sSimultaneousFlags[ index ] )
-            log_error( "\t%s\n", actions[ index ]->GetName() );
+        if (!sSimultaneousFlags[index])
+            log_error("\t%s\n", actions[index]->GetName());
-  if (actionEvents) delete [] actionEvents;
+    if (actionEvents) delete[] actionEvents;
     return -1;
diff --git a/test_conformance/events/test_event_dependencies.cpp b/test_conformance/events/test_event_dependencies.cpp
index 4113654..45b260a 100644
--- a/test_conformance/events/test_event_dependencies.cpp
+++ b/test_conformance/events/test_event_dependencies.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -39,61 +39,79 @@
  Tests event dependencies by running two kernels that use the same buffer.
  If two_queues is set they are run in separate queues.
- If test_enqueue_wait_for_events is set then clEnqueueWaitForEvent is called between them.
- If test_barrier is set then clEnqueueBarrier is called between them (only for single queue).
- If neither are set, nothing is done to prevent them from executing in the wrong order. This can be used for verification.
+ If test_enqueue_wait_for_events is set then clEnqueueWaitForEvent is called
+ between them. If test_barrier is set then clEnqueueBarrier is called between
+ them (only for single queue). If neither is set, nothing is done to prevent
+ them from executing in the wrong order. This can be used for verification.
-int test_event_enqueue_wait_for_events_run_test( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int two_queues, int two_devices,
-                                                int test_enqueue_wait_for_events, int test_barrier, int use_waitlist, int use_marker)
+int test_event_enqueue_wait_for_events_run_test(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements, int two_queues, int two_devices,
+    int test_enqueue_wait_for_events, int test_barrier, int use_waitlist,
+    int use_marker)
     cl_int error = CL_SUCCESS;
-    size_t threads[3] = {TEST_SIZE,0,0};
+    size_t threads[3] = { TEST_SIZE, 0, 0 };
     int i, loop_count, event_count, expected_value, failed;
     int expected_if_only_queue[2];
     int max_count = TEST_SIZE;
     cl_platform_id platform;
-    cl_command_queue queues[2];    // Not a wrapper so we don't autorelease if they are the same
-    clCommandQueueWrapper queueWrappers[2];    // If they are different, we use the wrapper so it will auto release
+    cl_command_queue
+        queues[2]; // Not a wrapper so we don't autorelease if they are the same
+    clCommandQueueWrapper queueWrappers[2]; // If they are different, we use the
+                                            // wrapper so it will auto release
     clContextWrapper context_to_use;
     clMemWrapper data;
     clProgramWrapper program;
     clKernelWrapper kernel1[TEST_COUNT], kernel2[TEST_COUNT];
-    clEventWrapper event[TEST_COUNT*4+2]; // If we usemarkers we get 2 more events per iteration
+    clEventWrapper event[TEST_COUNT * 4 + 2]; // If we usemarkers we get 2 more
+                                              // events per iteration
     if (test_enqueue_wait_for_events)
-        log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier function.\n");
+        log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier "
+                 "function.\n");
     if (test_barrier)
-        log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier function.\n");
+        log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier "
+                 "function.\n");
     if (use_waitlist)
-        log_info("\tTesting with waitlist-based depenednecies between kernels.\n");
+        log_info(
+            "\tTesting with waitlist-based depenednecies between kernels.\n");
     if (use_marker)
         log_info("\tTesting with clEnqueueMarker as a barrier function.\n");
-    if (test_barrier && (two_queues || two_devices)) {
-        log_error("\tTest requested with clEnqueueBarrier across two queues. This is not a valid combination.\n");
+    if (test_barrier && (two_queues || two_devices))
+    {
+        log_error("\tTest requested with clEnqueueBarrier across two queues. "
+                  "This is not a valid combination.\n");
         return -1;
     error = clGetPlatformIDs(1, &platform, NULL);
     test_error(error, "clGetPlatformIDs failed.");
-    // If we are to use two devices, then get them and create a context with both.
+    // If we are to use two devices, then get them and create a context with
+    // both.
     cl_device_id *two_device_ids;
-    if (two_devices) {
-        two_device_ids = (cl_device_id*)malloc(sizeof(cl_device_id)*2);
+    if (two_devices)
+    {
+        two_device_ids = (cl_device_id *)malloc(sizeof(cl_device_id) * 2);
         cl_uint number_returned;
-        error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, two_device_ids, &number_returned);
-        test_error( error, "clGetDeviceIDs for CL_DEVICE_TYPE_ALL failed.");
-        if (number_returned != 2) {
+        error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, two_device_ids,
+                               &number_returned);
+        test_error(error, "clGetDeviceIDs for CL_DEVICE_TYPE_ALL failed.");
+        if (number_returned != 2)
+        {
             log_info("Failed to obtain two devices. Test can not run.\n");
             return 0;
-        for (i=0; i<2; i++) {
+        for (i = 0; i < 2; i++)
+        {
             cl_device_type type;
-            error = clGetDeviceInfo(two_device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
-            test_error( error, "clGetDeviceInfo failed.");
+            error = clGetDeviceInfo(two_device_ids[i], CL_DEVICE_TYPE,
+                                    sizeof(cl_device_type), &type, NULL);
+            test_error(error, "clGetDeviceInfo failed.");
             if (type & CL_DEVICE_TYPE_CPU)
                 log_info("\tDevice %d is CL_DEVICE_TYPE_CPU.\n", i);
             if (type & CL_DEVICE_TYPE_GPU)
@@ -104,12 +122,16 @@
                 log_info("\tDevice %d is CL_DEVICE_TYPE_DEFAULT.\n", i);
-        context_to_use = clCreateContext(NULL, 2, two_device_ids, notify_callback, NULL, &error);
+        context_to_use = clCreateContext(NULL, 2, two_device_ids,
+                                         notify_callback, NULL, &error);
         test_error(error, "clCreateContext failed for two devices.");
         log_info("\tTesting with two devices.\n");
-    } else {
-        context_to_use = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &error);
+    }
+    else
+    {
+        context_to_use =
+            clCreateContext(NULL, 1, &deviceID, NULL, NULL, &error);
         test_error(error, "clCreateContext failed for one device.");
         log_info("\tTesting with one device.\n");
@@ -117,41 +139,55 @@
     // If we are using two queues then create them
     cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
-    if (two_queues) {
+    if (two_queues)
+    {
         // Get a second queue
         if (two_devices)
-            if( !checkDeviceForQueueSupport( two_device_ids[ 0 ], props ) ||
-               !checkDeviceForQueueSupport( two_device_ids[ 1 ], props ) )
+            if (!checkDeviceForQueueSupport(two_device_ids[0], props)
+                || !checkDeviceForQueueSupport(two_device_ids[1], props))
-                log_info( "WARNING: One or more device for multi-device test does not support out-of-order exec mode; skipping test.\n" );
+                log_info(
+                    "WARNING: One or more device for multi-device test does "
+                    "not support out-of-order exec mode; skipping test.\n");
                 return -1942;
-            queueWrappers[0] = clCreateCommandQueue(context_to_use, two_device_ids[0], props, &error);
-            test_error(error, "clCreateCommandQueue for first queue on first device failed.");
-            queueWrappers[1] = clCreateCommandQueue(context_to_use, two_device_ids[1], props, &error);
-            test_error(error, "clCreateCommandQueue for second queue on second device failed.");
+            queueWrappers[0] = clCreateCommandQueue(
+                context_to_use, two_device_ids[0], props, &error);
+            test_error(
+                error,
+                "clCreateCommandQueue for first queue on first device failed.");
+            queueWrappers[1] = clCreateCommandQueue(
+                context_to_use, two_device_ids[1], props, &error);
+            test_error(error,
+                       "clCreateCommandQueue for second queue on second device "
+                       "failed.");
-            // Single device has already been checked for out-of-order exec support
-            queueWrappers[0] = clCreateCommandQueue(context_to_use, deviceID, props, &error);
+            // Single device has already been checked for out-of-order exec
+            // support
+            queueWrappers[0] =
+                clCreateCommandQueue(context_to_use, deviceID, props, &error);
             test_error(error, "clCreateCommandQueue for first queue failed.");
-            queueWrappers[1] = clCreateCommandQueue(context_to_use, deviceID, props, &error);
+            queueWrappers[1] =
+                clCreateCommandQueue(context_to_use, deviceID, props, &error);
             test_error(error, "clCreateCommandQueue for second queue failed.");
-        // Ugly hack to make sure we only have the wrapper auto-release if they are different queues
+        // Ugly hack to make sure we only have the wrapper auto-release if they
+        // are different queues
         queues[0] = queueWrappers[0];
         queues[1] = queueWrappers[1];
         log_info("\tTesting with two queues.\n");
-        // (Note: single device has already been checked for out-of-order exec support)
-        // Otherwise create one queue and have the second one be the same
-        queueWrappers[0] = clCreateCommandQueue(context_to_use, deviceID, props, &error);
+        // (Note: single device has already been checked for out-of-order exec
+        // support.) Otherwise create one queue and have the second one be the
+        // same
+        queueWrappers[0] =
+            clCreateCommandQueue(context_to_use, deviceID, props, &error);
         test_error(error, "clCreateCommandQueue for first queue failed.");
         queues[0] = queueWrappers[0];
         queues[1] = (cl_command_queue)queues[0];
@@ -160,236 +196,346 @@
     // Setup - create a buffer and the two kernels
-    data = clCreateBuffer(context_to_use, CL_MEM_READ_WRITE, TEST_SIZE*sizeof(cl_int), NULL, &error);
-    test_error( error, "clCreateBuffer failed");
+    data = clCreateBuffer(context_to_use, CL_MEM_READ_WRITE,
+                          TEST_SIZE * sizeof(cl_int), NULL, &error);
+    test_error(error, "clCreateBuffer failed");
     // Initialize the values to zero
-    cl_int *values = (cl_int*)malloc(TEST_SIZE*sizeof(cl_int));
-    for (i=0; i<(int)TEST_SIZE; i++)
-        values[i] = 0;
-    error = clEnqueueWriteBuffer(queues[0], data, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), values, 0, NULL, NULL);
-    test_error( error, "clEnqueueWriteBuffer failed");
+    cl_int *values = (cl_int *)malloc(TEST_SIZE * sizeof(cl_int));
+    for (i = 0; i < (int)TEST_SIZE; i++) values[i] = 0;
+    error =
+        clEnqueueWriteBuffer(queues[0], data, CL_TRUE, 0,
+                             TEST_SIZE * sizeof(cl_int), values, 0, NULL, NULL);
+    test_error(error, "clEnqueueWriteBuffer failed");
     expected_value = 0;
     // Build the kernels
-    if (create_single_kernel_helper( context_to_use, &program, &kernel1[0], 1, write_kernels, "write_up" ))
+    if (create_single_kernel_helper(context_to_use, &program, &kernel1[0], 1,
+                                    write_kernels, "write_up"))
         return -1;
     error = clSetKernelArg(kernel1[0], 0, sizeof(data), &data);
     error |= clSetKernelArg(kernel1[0], 1, sizeof(max_count), &max_count);
-    test_error( error, "clSetKernelArg 1 failed");
+    test_error(error, "clSetKernelArg 1 failed");
-    for (i=1; i<TEST_COUNT; i++) {
+    for (i = 1; i < TEST_COUNT; i++)
+    {
         kernel1[i] = clCreateKernel(program, "write_up", &error);
-        test_error( error, "clCreateKernel 1 failed");
+        test_error(error, "clCreateKernel 1 failed");
         error = clSetKernelArg(kernel1[i], 0, sizeof(data), &data);
         error |= clSetKernelArg(kernel1[i], 1, sizeof(max_count), &max_count);
-        test_error( error, "clSetKernelArg 1 failed");
+        test_error(error, "clSetKernelArg 1 failed");
-    for (i=0; i<TEST_COUNT; i++) {
+    for (i = 0; i < TEST_COUNT; i++)
+    {
         kernel2[i] = clCreateKernel(program, "write_down", &error);
-        test_error( error, "clCreateKernel 2 failed");
+        test_error(error, "clCreateKernel 2 failed");
         error = clSetKernelArg(kernel2[i], 0, sizeof(data), &data);
         error |= clSetKernelArg(kernel2[i], 1, sizeof(max_count), &max_count);
-        test_error( error, "clSetKernelArg 2 failed");
+        test_error(error, "clSetKernelArg 2 failed");
-    // Execution - run the first kernel, then enqueue the wait on the events, then the second kernel
-    // If clEnqueueBarrierWithWaitList works, the buffer will be filled with 1s, then multiplied by 4s,
-    // then incremented to 5s, repeatedly. Otherwise the values may be 2s (if the first one doesn't work) or 8s
-    // (if the second one doesn't work).
+    // Execution - run the first kernel, then enqueue the wait on the events,
+    // then the second kernel. If clEnqueueBarrierWithWaitList works, the buffer
+    // will be filled with 1s, then multiplied by 4s, then incremented to 5s,
+    // repeatedly. Otherwise the values may be 2s (if the first one doesn't
+    // work) or 8s (if the second one doesn't work).
     if (RANDOMIZE)
         log_info("Queues chosen randomly for each kernel execution.\n");
         log_info("Queues chosen alternatily for each kernel execution.\n");
     event_count = 0;
-    for (i=0; i<(int)TEST_SIZE; i++)
-        values[i] = 1;
-    error = clEnqueueWriteBuffer(queues[0], data, CL_FALSE, 0, TEST_SIZE*sizeof(cl_int), values, 0, NULL, &event[event_count]);
-    test_error( error, "clEnqueueWriteBuffer 2 failed");
+    for (i = 0; i < (int)TEST_SIZE; i++) values[i] = 1;
+    error = clEnqueueWriteBuffer(queues[0], data, CL_FALSE, 0,
+                                 TEST_SIZE * sizeof(cl_int), values, 0, NULL,
+                                 &event[event_count]);
+    test_error(error, "clEnqueueWriteBuffer 2 failed");
     expected_value = 1;
     expected_if_only_queue[0] = 1;
     expected_if_only_queue[1] = 1;
     int queue_to_use = 1;
-    if (test_enqueue_wait_for_events) {
-        error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, &event[event_count], NULL );
-        test_error( error, "Unable to queue wait for events" );
-    } else if (test_barrier) {
-        error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL);
-        test_error( error, "Unable to queue barrier" );
+    if (test_enqueue_wait_for_events)
+    {
+        error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1,
+                                             &event[event_count], NULL);
+        test_error(error, "Unable to queue wait for events");
+    }
+    else if (test_barrier)
+    {
+        error =
+            clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL);
+        test_error(error, "Unable to queue barrier");
-    for (loop_count=0; loop_count<TEST_COUNT; loop_count++) {
+    for (loop_count = 0; loop_count < TEST_COUNT; loop_count++)
+    {
         // Execute kernel 1
-        if (use_waitlist | use_marker) {
-            if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, NULL, threads, NULL, 1, &event[%d], &event[%d])\n", queue_to_use, loop_count, event_count-1, event_count);
-            error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel1[loop_count], 1, NULL, threads, NULL, 1, &event[event_count-1], &event[event_count]);
-        } else {
-            if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, NULL, threads, NULL, 0, NULL, &event[%d])\n", queue_to_use, loop_count, event_count);
-            error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel1[loop_count], 1, NULL, threads, NULL, 0, NULL, &event[event_count]);
+        if (use_waitlist | use_marker)
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, "
+                         "NULL, threads, NULL, 1, &event[%d], &event[%d])\n",
+                         queue_to_use, loop_count, event_count - 1,
+                         event_count);
+            error = clEnqueueNDRangeKernel(
+                queues[queue_to_use], kernel1[loop_count], 1, NULL, threads,
+                NULL, 1, &event[event_count - 1], &event[event_count]);
-        if (error) {
+        else
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, "
+                         "NULL, threads, NULL, 0, NULL, &event[%d])\n",
+                         queue_to_use, loop_count, event_count);
+            error = clEnqueueNDRangeKernel(
+                queues[queue_to_use], kernel1[loop_count], 1, NULL, threads,
+                NULL, 0, NULL, &event[event_count]);
+        }
+        if (error)
+        {
             log_info("\tLoop count %d\n", loop_count);
-            print_error( error, "clEnqueueNDRangeKernel for kernel 1 failed");
+            print_error(error, "clEnqueueNDRangeKernel for kernel 1 failed");
             return error;
         expected_value *= 2;
         expected_if_only_queue[queue_to_use] *= 2;
         // If we are using a marker, it needs to go in the same queue
-        if (use_marker) {
+        if (use_marker)
+        {
-            if (DEBUG_OUT) log_info("clEnqueueMarker(queues[%d], event[%d])\n", queue_to_use, event_count);
+            if (DEBUG_OUT)
+                log_info("clEnqueueMarker(queues[%d], event[%d])\n",
+                         queue_to_use, event_count);
-            #ifdef CL_VERSION_1_2
-                error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL, &event[event_count]);
-            #else
-                error = clEnqueueMarker(queues[queue_to_use], &event[event_count]);
-            #endif
+#ifdef CL_VERSION_1_2
+            error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL,
+                                                &event[event_count]);
+            error = clEnqueueMarker(queues[queue_to_use], &event[event_count]);
         // Pick the next queue to run
         if (RANDOMIZE)
-            queue_to_use = rand()%2;
+            queue_to_use = rand() % 2;
-            queue_to_use = (queue_to_use + 1)%2;
+            queue_to_use = (queue_to_use + 1) % 2;
         // Put in a barrier if requested
-        if (test_enqueue_wait_for_events) {
-            if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, &event[%d], NULL)\n", queue_to_use, event_count);
-            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, &event[event_count], NULL);
-            test_error( error, "Unable to queue wait for events" );
-        } else if (test_barrier) {
-            if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d])\n", queue_to_use);
-            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL);
-            test_error( error, "Unable to queue barrier" );
+        if (test_enqueue_wait_for_events)
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, "
+                         "&event[%d], NULL)\n",
+                         queue_to_use, event_count);
+            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1,
+                                                 &event[event_count], NULL);
+            test_error(error, "Unable to queue wait for events");
+        }
+        else if (test_barrier)
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueBarrierWithWaitList(queues[%d])\n",
+                         queue_to_use);
+            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL,
+                                                 NULL);
+            test_error(error, "Unable to queue barrier");
         // Execute Kernel 2
-        if (use_waitlist | use_marker) {
-            if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, NULL, threads, NULL, 1, &event[%d], &event[%d])\n", queue_to_use, loop_count, event_count-1, event_count);
-            error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel2[loop_count], 1, NULL, threads, NULL, 1, &event[event_count-1], &event[event_count]);
-        } else {
-            if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, NULL, threads, NULL, 0, NULL, &event[%d])\n", queue_to_use, loop_count, event_count);
-            error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel2[loop_count], 1, NULL, threads, NULL, 0, NULL, &event[event_count]);
+        if (use_waitlist | use_marker)
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, "
+                         "NULL, threads, NULL, 1, &event[%d], &event[%d])\n",
+                         queue_to_use, loop_count, event_count - 1,
+                         event_count);
+            error = clEnqueueNDRangeKernel(
+                queues[queue_to_use], kernel2[loop_count], 1, NULL, threads,
+                NULL, 1, &event[event_count - 1], &event[event_count]);
-        if (error) {
+        else
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, "
+                         "NULL, threads, NULL, 0, NULL, &event[%d])\n",
+                         queue_to_use, loop_count, event_count);
+            error = clEnqueueNDRangeKernel(
+                queues[queue_to_use], kernel2[loop_count], 1, NULL, threads,
+                NULL, 0, NULL, &event[event_count]);
+        }
+        if (error)
+        {
             log_info("\tLoop count %d\n", loop_count);
-            print_error( error, "clEnqueueNDRangeKernel for kernel 2 failed");
+            print_error(error, "clEnqueueNDRangeKernel for kernel 2 failed");
             return error;
         // If we are using a marker, it needs to go in the same queue
-        if (use_marker) {
+        if (use_marker)
+        {
-            if (DEBUG_OUT) log_info("clEnqueueMarker(queues[%d], event[%d])\n", queue_to_use, event_count);
+            if (DEBUG_OUT)
+                log_info("clEnqueueMarker(queues[%d], event[%d])\n",
+                         queue_to_use, event_count);
-        #ifdef CL_VERSION_1_2
-            error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL, &event[event_count]);
-        #else
+#ifdef CL_VERSION_1_2
+            error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL,
+                                                &event[event_count]);
             error = clEnqueueMarker(queues[queue_to_use], &event[event_count]);
-        #endif
         // Pick the next queue to run
         if (RANDOMIZE)
-            queue_to_use = rand()%2;
+            queue_to_use = rand() % 2;
-            queue_to_use = (queue_to_use + 1)%2;
+            queue_to_use = (queue_to_use + 1) % 2;
         // Put in a barrier if requested
-        if (test_enqueue_wait_for_events) {
-            if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, &event[%d], NULL)\n", queue_to_use, event_count);
-            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, &event[event_count], NULL );
-            test_error( error, "Unable to queue wait for events" );
-        } else if (test_barrier) {
-            if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d])\n", queue_to_use);
-            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL);
-            test_error( error, "Unable to queue barrier" );
+        if (test_enqueue_wait_for_events)
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, "
+                         "&event[%d], NULL)\n",
+                         queue_to_use, event_count);
+            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1,
+                                                 &event[event_count], NULL);
+            test_error(error, "Unable to queue wait for events");
+        }
+        else if (test_barrier)
+        {
+            if (DEBUG_OUT)
+                log_info("clEnqueueBarrierWithWaitList(queues[%d])\n",
+                         queue_to_use);
+            error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL,
+                                                 NULL);
+            test_error(error, "Unable to queue barrier");
     // Now finish up everything
-    if (two_queues) {
+    if (two_queues)
+    {
         error = clFlush(queues[1]);
-        test_error( error, "clFlush[1] failed");
+        test_error(error, "clFlush[1] failed");
-    error = clEnqueueReadBuffer(queues[0], data, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), values, 1, &event[event_count], NULL);
+    error = clEnqueueReadBuffer(queues[0], data, CL_TRUE, 0,
+                                TEST_SIZE * sizeof(cl_int), values, 1,
+                                &event[event_count], NULL);
     test_error(error, "clEnqueueReadBuffer failed");
     failed = 0;
-    for (i=0; i<(int)TEST_SIZE; i++)
-        if (values[i] != expected_value) {
+    for (i = 0; i < (int)TEST_SIZE; i++)
+        if (values[i] != expected_value)
+        {
             failed = 1;
-            log_info("\tvalues[%d] = %d, expected %d (If only queue 1 accessed memory: %d only queue 2 accessed memory: %d)\n",
-                     i, values[i], expected_value, expected_if_only_queue[0], expected_if_only_queue[1]);
+            log_info("\tvalues[%d] = %d, expected %d (If only queue 1 accessed "
+                     "memory: %d only queue 2 accessed memory: %d)\n",
+                     i, values[i], expected_value, expected_if_only_queue[0],
+                     expected_if_only_queue[1]);
-    if (two_devices)
-        free(two_device_ids);
+    if (two_devices) free(two_device_ids);
     return failed;
-int test( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements,
-         int two_queues, int two_devices,
-         int test_enqueue_wait_for_events, int test_barrier, int use_waitlists, int use_marker)
+int test(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+         int num_elements, int two_queues, int two_devices,
+         int test_enqueue_wait_for_events, int test_barrier, int use_waitlists,
+         int use_marker)
-    if( !checkDeviceForQueueSupport( deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) )
+    if (!checkDeviceForQueueSupport(deviceID,
+                                    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE))
-        log_info( "WARNING: Device does not support out-of-order exec mode; skipping test.\n" );
+        log_info("WARNING: Device does not support out-of-order exec mode; "
+                 "skipping test.\n");
         return 0;
-    log_info("Running test for baseline results to determine if out-of-order execution can be detected...\n");
-    int baseline_results = test_event_enqueue_wait_for_events_run_test(deviceID, context, queue, num_elements, two_queues, two_devices, 0, 0, 0, 0);
-    if (baseline_results == 0) {
+    log_info("Running test for baseline results to determine if out-of-order "
+             "execution can be detected...\n");
+    int baseline_results = test_event_enqueue_wait_for_events_run_test(
+        deviceID, context, queue, num_elements, two_queues, two_devices, 0, 0,
+        0, 0);
+    if (baseline_results == 0)
+    {
         if (test_enqueue_wait_for_events)
-            log_info("WARNING: could not detect any out-of-order execution without using clEnqueueBarrierWithWaitList, so this test is not a valid test of out-of-order event dependencies.\n");
+            log_info(
+                "WARNING: could not detect any out-of-order execution without "
+                "using clEnqueueBarrierWithWaitList, so this test is not a "
+                "valid test of out-of-order event dependencies.\n");
         if (test_barrier)
-            log_info("WARNING: could not detect any out-of-order execution without using clEnqueueBarrierWithWaitList, so this test is not a valid test of out-of-order event dependencies.\n");
+            log_info(
+                "WARNING: could not detect any out-of-order execution without "
+                "using clEnqueueBarrierWithWaitList, so this test is not a "
+                "valid test of out-of-order event dependencies.\n");
         if (use_waitlists)
-            log_info("WARNING: could not detect any out-of-order execution without using waitlists, so this test is not a valid test of out-of-order event dependencies.\n");
+            log_info("WARNING: could not detect any out-of-order execution "
+                     "without using waitlists, so this test is not a valid "
+                     "test of out-of-order event dependencies.\n");
         if (use_marker)
-            log_info("WARNING: could not detect any out-of-order execution without using clEnqueueMarker, so this test is not a valid test of out-of-order event dependencies.\n");
-    } else if (baseline_results == 1) {
+            log_info("WARNING: could not detect any out-of-order execution "
+                     "without using clEnqueueMarker, so this test is not a "
+                     "valid test of out-of-order event dependencies.\n");
+    }
+    else if (baseline_results == 1)
+    {
         if (test_enqueue_wait_for_events)
-            log_info("Detected incorrect execution (possibly out-of-order) without clEnqueueBarrierWithWaitList. Test can be a valid test of out-of-order event dependencies.\n");
+            log_info("Detected incorrect execution (possibly out-of-order) "
+                     "without clEnqueueBarrierWithWaitList. Test can be a "
+                     "valid test of out-of-order event dependencies.\n");
         if (test_barrier)
-            log_info("Detected incorrect execution (possibly out-of-order) without clEnqueueBarrierWithWaitList. Test can be a valid test of out-of-order event dependencies.\n");
+            log_info("Detected incorrect execution (possibly out-of-order) "
+                     "without clEnqueueBarrierWithWaitList. Test can be a "
+                     "valid test of out-of-order event dependencies.\n");
         if (use_waitlists)
-            log_info("Detected incorrect execution (possibly out-of-order) without waitlists. Test can be a valid test of out-of-order event dependencies.\n");
+            log_info("Detected incorrect execution (possibly out-of-order) "
+                     "without waitlists. Test can be a valid test of "
+                     "out-of-order event dependencies.\n");
         if (use_marker)
-            log_info("Detected incorrect execution (possibly out-of-order) without clEnqueueMarker. Test can be a valid test of out-of-order event dependencies.\n");
-    } else if( baseline_results == -1942 ) {
+            log_info("Detected incorrect execution (possibly out-of-order) "
+                     "without clEnqueueMarker. Test can be a valid test of "
+                     "out-of-order event dependencies.\n");
+    }
+    else if (baseline_results == -1942)
+    {
         // Just ignore and return (out-of-order exec mode not supported)
         return 0;
-    } else {
+    }
+    else
+    {
         print_error(baseline_results, "Baseline run failed");
         return baseline_results;
     log_info("Running test for actual results...\n");
-    return test_event_enqueue_wait_for_events_run_test(deviceID, context, queue, num_elements, two_queues, two_devices,
-                                                       test_enqueue_wait_for_events, test_barrier,  use_waitlists, use_marker);
+    return test_event_enqueue_wait_for_events_run_test(
+        deviceID, context, queue, num_elements, two_queues, two_devices,
+        test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
-int test_out_of_order_event_waitlist_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_waitlist_single_queue(cl_device_id deviceID,
+                                                  cl_context context,
+                                                  cl_command_queue queue,
+                                                  int num_elements)
     int two_queues = 0;
     int two_devices = 0;
@@ -397,10 +543,15 @@
     int test_barrier = 0;
     int use_waitlists = 1;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_waitlist_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_waitlist_multi_queue(cl_device_id deviceID,
+                                                 cl_context context,
+                                                 cl_command_queue queue,
+                                                 int num_elements)
     int two_queues = 1;
     int two_devices = 0;
@@ -408,10 +559,14 @@
     int test_barrier = 0;
     int use_waitlists = 1;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_waitlist_multi_queue_multi_device( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_waitlist_multi_queue_multi_device(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
     int two_queues = 1;
     int two_devices = 1;
@@ -419,11 +574,15 @@
     int test_barrier = 0;
     int use_waitlists = 1;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_wait_for_events_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_wait_for_events_single_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
     int two_queues = 0;
     int two_devices = 0;
@@ -431,10 +590,14 @@
     int test_barrier = 0;
     int use_waitlists = 0;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_wait_for_events_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_wait_for_events_multi_queue(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
     int two_queues = 1;
     int two_devices = 0;
@@ -442,11 +605,15 @@
     int test_barrier = 0;
     int use_waitlists = 0;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
     int two_queues = 1;
     int two_devices = 1;
@@ -454,13 +621,16 @@
     int test_barrier = 0;
     int use_waitlists = 0;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_barrier_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_barrier_single_queue(cl_device_id deviceID,
+                                                         cl_context context,
+                                                         cl_command_queue queue,
+                                                         int num_elements)
     int two_queues = 0;
     int two_devices = 0;
@@ -468,11 +638,16 @@
     int test_barrier = 1;
     int use_waitlists = 0;
     int use_marker = 0;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_marker_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_marker_single_queue(cl_device_id deviceID,
+                                                        cl_context context,
+                                                        cl_command_queue queue,
+                                                        int num_elements)
     int two_queues = 0;
     int two_devices = 0;
@@ -480,10 +655,15 @@
     int test_barrier = 0;
     int use_waitlists = 0;
     int use_marker = 1;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_marker_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_marker_multi_queue(cl_device_id deviceID,
+                                                       cl_context context,
+                                                       cl_command_queue queue,
+                                                       int num_elements)
     int two_queues = 1;
     int two_devices = 0;
@@ -491,11 +671,15 @@
     int test_barrier = 0;
     int use_waitlists = 0;
     int use_marker = 1;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
-int test_out_of_order_event_enqueue_marker_multi_queue_multi_device( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_out_of_order_event_enqueue_marker_multi_queue_multi_device(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
     int two_queues = 1;
     int two_devices = 1;
@@ -503,7 +687,7 @@
     int test_barrier = 0;
     int use_waitlists = 0;
     int use_marker = 1;
-    return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker);
+    return test(deviceID, context, queue, num_elements, two_queues, two_devices,
+                test_enqueue_wait_for_events, test_barrier, use_waitlists,
+                use_marker);
diff --git a/test_conformance/events/test_events.cpp b/test_conformance/events/test_events.cpp
index 26693f9..34157fa 100644
--- a/test_conformance/events/test_events.cpp
+++ b/test_conformance/events/test_events.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -15,97 +15,112 @@
 #include "testBase.h"
-#if ! defined( _WIN32 )
-    #include "unistd.h" // for "sleep" used in the "while (1)" busy wait loop in
+#if !defined(_WIN32)
+#include "unistd.h" // for "sleep" used in the "while (1)" busy wait loop in
 // test_event_flush
 const char *sample_long_test_kernel[] = {
-"__kernel void sample_test(__global float *src, __global int *dst)\n"
-"    int  tid = get_global_id(0);\n"
-"     int  i;\n"
-"    for( i = 0; i < 10000; i++ )\n"
-"    {\n"
-"        dst[tid] = (int)src[tid] * 3;\n"
-"    }\n"
-"}\n" };
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     int  i;\n"
+    "\n"
+    "    for( i = 0; i < 10000; i++ )\n"
+    "    {\n"
+    "        dst[tid] = (int)src[tid] * 3;\n"
+    "    }\n"
+    "\n"
+    "}\n"
-int create_and_execute_kernel( cl_context inContext, cl_command_queue inQueue, cl_program *outProgram, cl_kernel *outKernel, cl_mem *streams,
-                              unsigned int lineCount, const char **lines, const char *kernelName, cl_event *outEvent )
+int create_and_execute_kernel(cl_context inContext, cl_command_queue inQueue,
+                              cl_program *outProgram, cl_kernel *outKernel,
+                              cl_mem *streams, unsigned int lineCount,
+                              const char **lines, const char *kernelName,
+                              cl_event *outEvent)
     size_t threads[1] = { 1000 }, localThreads[1];
     int error;
-    if( create_single_kernel_helper( inContext, outProgram, outKernel, lineCount, lines, kernelName ) )
+    if (create_single_kernel_helper(inContext, outProgram, outKernel, lineCount,
+                                    lines, kernelName))
         return -1;
-    error = get_max_common_work_group_size( inContext, *outKernel, threads[0], &localThreads[0] );
-    test_error( error, "Unable to get work group size to use" );
+    error = get_max_common_work_group_size(inContext, *outKernel, threads[0],
+                                           &localThreads[0]);
+    test_error(error, "Unable to get work group size to use");
     streams[0] = clCreateBuffer(inContext, CL_MEM_READ_WRITE,
                                 sizeof(cl_float) * 1000, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     streams[1] = clCreateBuffer(inContext, CL_MEM_READ_WRITE,
                                 sizeof(cl_int) * 1000, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     /* Set the arguments */
-    error = clSetKernelArg( *outKernel, 0, sizeof( streams[0] ), &streams[0] );
-    test_error( error, "Unable to set kernel arguments" );
-    error = clSetKernelArg( *outKernel, 1, sizeof( streams[1] ), &streams[1] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(*outKernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel arguments");
+    error = clSetKernelArg(*outKernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel arguments");
-    error = clEnqueueNDRangeKernel(inQueue, *outKernel, 1, NULL, threads, localThreads, 0, NULL, outEvent);
-    test_error( error, "Unable to execute test kernel" );
+    error = clEnqueueNDRangeKernel(inQueue, *outKernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, outEvent);
+    test_error(error, "Unable to execute test kernel");
     return 0;
-#define SETUP_EVENT( c, q ) \
-clProgramWrapper program; \
-clKernelWrapper kernel; \
-clMemWrapper streams[2]; \
-clEventWrapper event; \
-int error; \
-if( create_and_execute_kernel( c, q, &program, &kernel, &streams[0], 1, sample_long_test_kernel, "sample_test", &event ) ) return -1;
+#define SETUP_EVENT(c, q)                                                      \
+    clProgramWrapper program;                                                  \
+    clKernelWrapper kernel;                                                    \
+    clMemWrapper streams[2];                                                   \
+    clEventWrapper event;                                                      \
+    int error;                                                                 \
+    if (create_and_execute_kernel(c, q, &program, &kernel, &streams[0], 1,     \
+                                  sample_long_test_kernel, "sample_test",      \
+                                  &event))                                     \
+        return -1;
 #define FINISH_EVENT(_q) clFinish(_q)
-const char *IGetStatusString( cl_int status )
+const char *IGetStatusString(cl_int status)
-    static char tempString[ 128 ];
-    switch( status )
+    static char tempString[128];
+    switch (status)
-        case CL_COMPLETE:    return "CL_COMPLETE";
-        case CL_RUNNING:    return "CL_RUNNING";
-        case CL_QUEUED:        return "CL_QUEUED";
-        case CL_SUBMITTED:    return "CL_SUBMITTED";
+        case CL_COMPLETE: return "CL_COMPLETE";
+        case CL_RUNNING: return "CL_RUNNING";
+        case CL_QUEUED: return "CL_QUEUED";
+        case CL_SUBMITTED: return "CL_SUBMITTED";
-            sprintf( tempString, "<unknown: %d>", (int)status );
+            sprintf(tempString, "<unknown: %d>", (int)status);
             return tempString;
 /* Note: tests clGetEventStatus and clReleaseEvent (implicitly) */
-int test_event_get_execute_status( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_get_execute_status(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     cl_int status;
-    SETUP_EVENT( context, queue );
+    SETUP_EVENT(context, queue);
     /* Now wait for it to be done */
-    error = clWaitForEvents( 1, &event );
-    test_error( error, "Unable to wait for event" );
+    error = clWaitForEvents(1, &event);
+    test_error(error, "Unable to wait for event");
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus to wait for event completion failed" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error,
+               "Calling clGetEventStatus to wait for event completion failed");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after event complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
@@ -113,57 +128,75 @@
     return 0;
-int test_event_get_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_get_info(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
-    SETUP_EVENT( context, queue );
+    SETUP_EVENT(context, queue);
     /* Verify parameters of clGetEventInfo not already tested by other tests */
     cl_command_queue otherQueue;
     size_t size;
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_QUEUE, sizeof( otherQueue ), &otherQueue, &size );
-    test_error( error, "Unable to get event info!" );
-    // We can not check if this is the right queue because this is an opaque object.
-    if( size != sizeof( queue ) )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_QUEUE, sizeof(otherQueue),
+                           &otherQueue, &size);
+    test_error(error, "Unable to get event info!");
+    // We can not check if this is the right queue because this is an opaque
+    // object.
+    if (size != sizeof(queue))
-        log_error( "ERROR: Returned command queue size does not validate (expected %d, got %d)\n", (int)sizeof( queue ), (int)size );
+        log_error("ERROR: Returned command queue size does not validate "
+                  "(expected %d, got %d)\n",
+                  (int)sizeof(queue), (int)size);
         return -1;
     cl_command_type type;
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_TYPE, sizeof( type ), &type, &size );
-    test_error( error, "Unable to get event info!" );
-    if( type != CL_COMMAND_NDRANGE_KERNEL )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type,
+                           &size);
+    test_error(error, "Unable to get event info!");
-        log_error( "ERROR: Returned command type does not validate (expected %d, got %d)\n", (int)CL_COMMAND_NDRANGE_KERNEL, (int)type );
+        log_error("ERROR: Returned command type does not validate (expected "
+                  "%d, got %d)\n",
+                  (int)CL_COMMAND_NDRANGE_KERNEL, (int)type);
         return -1;
-    if( size != sizeof( type ) )
+    if (size != sizeof(type))
-        log_error( "ERROR: Returned command type size does not validate (expected %d, got %d)\n", (int)sizeof( type ), (int)size );
+        log_error("ERROR: Returned command type size does not validate "
+                  "(expected %d, got %d)\n",
+                  (int)sizeof(type), (int)size);
         return -1;
     cl_uint count;
-    error = clGetEventInfo( event, CL_EVENT_REFERENCE_COUNT, sizeof( count ), &count, &size );
-    test_error( error, "Unable to get event info for CL_EVENT_REFERENCE_COUNT!" );
-    if( size != sizeof( count ) )
+    error = clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(count),
+                           &count, &size);
+    test_error(error, "Unable to get event info for CL_EVENT_REFERENCE_COUNT!");
+    if (size != sizeof(count))
-        log_error( "ERROR: Returned command type size does not validate (expected %d, got %d)\n", (int)sizeof( type ), (int)size );
+        log_error("ERROR: Returned command type size does not validate "
+                  "(expected %d, got %d)\n",
+                  (int)sizeof(type), (int)size);
         return -1;
     cl_context testCtx;
-    error = clGetEventInfo( event, CL_EVENT_CONTEXT, sizeof( testCtx ), &testCtx, &size );
-    test_error( error, "Unable to get event context info!" );
-    if( size != sizeof( context ) )
+    error = clGetEventInfo(event, CL_EVENT_CONTEXT, sizeof(testCtx), &testCtx,
+                           &size);
+    test_error(error, "Unable to get event context info!");
+    if (size != sizeof(context))
-        log_error( "ERROR: Returned context size does not validate (expected %d, got %d)\n", (int)sizeof( context ), (int)size );
+        log_error("ERROR: Returned context size does not validate (expected "
+                  "%d, got %d)\n",
+                  (int)sizeof(context), (int)size);
         return -1;
-    if( testCtx != context )
+    if (testCtx != context)
-        log_error( "ERROR: Returned context does not match (expected %p, got %p)\n", (void *)context, (void *)testCtx );
+        log_error(
+            "ERROR: Returned context does not match (expected %p, got %p)\n",
+            (void *)context, (void *)testCtx);
         return -1;
@@ -171,10 +204,11 @@
     return 0;
-int test_event_get_write_array_status( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_get_write_array_status(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
     cl_mem stream;
-    cl_float testArray[ 1024 * 32 ];
+    cl_float testArray[1024 * 32];
     cl_event event;
     int error;
     cl_int status;
@@ -182,34 +216,41 @@
     stream = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             sizeof(cl_float) * 1024 * 32, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
-    error = clEnqueueWriteBuffer(queue, stream, CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)testArray, 0, NULL, &event);
-    test_error( error, "Unable to set testing kernel data" );
+    error = clEnqueueWriteBuffer(queue, stream, CL_FALSE, 0,
+                                 sizeof(cl_float) * 1024 * 32,
+                                 (void *)testArray, 0, NULL, &event);
+    test_error(error, "Unable to set testing kernel data");
     /* Now wait for it to be done */
-    error = clWaitForEvents( 1, &event );
-    test_error( error, "Unable to wait for event" );
+    error = clWaitForEvents(1, &event);
+    test_error(error, "Unable to wait for event");
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus to wait for event completion failed" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error,
+               "Calling clGetEventStatus to wait for event completion failed");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array write complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after array write complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    clReleaseMemObject( stream );
-    clReleaseEvent( event );
+    clReleaseMemObject(stream);
+    clReleaseEvent(event);
     return 0;
-int test_event_get_read_array_status( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_get_read_array_status(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements)
     cl_mem stream;
-    cl_float testArray[ 1024 * 32 ];
+    cl_float testArray[1024 * 32];
     cl_event event;
     int error;
     cl_int status;
@@ -217,58 +258,72 @@
     stream = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             sizeof(cl_float) * 1024 * 32, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
-    error = clEnqueueReadBuffer(queue, stream, CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)testArray, 0, NULL, &event);
-    test_error( error, "Unable to get testing kernel data" );
+    error = clEnqueueReadBuffer(queue, stream, CL_FALSE, 0,
+                                sizeof(cl_float) * 1024 * 32, (void *)testArray,
+                                0, NULL, &event);
+    test_error(error, "Unable to get testing kernel data");
     /* It should still be running... */
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
-    if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE)
+    if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED
+        && status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array read (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "during array read (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
     /* Now wait for it to be done */
-    error = clWaitForEvents( 1, &event );
-    test_error( error, "Unable to wait for event" );
+    error = clWaitForEvents(1, &event);
+    test_error(error, "Unable to wait for event");
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus to wait for event completion failed" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error,
+               "Calling clGetEventStatus to wait for event completion failed");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array read complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after array read complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    clReleaseMemObject( stream );
-    clReleaseEvent( event );
+    clReleaseMemObject(stream);
+    clReleaseEvent(event);
     return 0;
 /* clGetEventStatus not implemented yet */
-int test_event_wait_for_execute( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_wait_for_execute(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
     cl_int status;
-    SETUP_EVENT( context, queue );
+    SETUP_EVENT(context, queue);
     /* Now we wait for it to be done, then test the status again */
-    error = clWaitForEvents( 1, &event );
-    test_error( error, "Unable to wait for execute event" );
+    error = clWaitForEvents(1, &event);
+    test_error(error, "Unable to wait for execute event");
     /* Make sure it worked */
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after event complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
@@ -276,11 +331,12 @@
     return 0;
-int test_event_wait_for_array( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_wait_for_array(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
     cl_mem streams[2];
-    cl_float readArray[ 1024 * 32 ];
-    cl_float writeArray[ 1024 * 32 ];
+    cl_float readArray[1024 * 32];
+    cl_float writeArray[1024 * 32];
     cl_event events[2];
     int error;
     cl_int status;
@@ -288,128 +344,155 @@
     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_float) * 1024 * 32, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_float) * 1024 * 32, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
-    error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)readArray, 0, NULL, &events[0]);
-    test_error( error, "Unable to read testing kernel data" );
+    error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0,
+                                sizeof(cl_float) * 1024 * 32, (void *)readArray,
+                                0, NULL, &events[0]);
+    test_error(error, "Unable to read testing kernel data");
-    error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)writeArray, 0, NULL, &events[1]);
-    test_error( error, "Unable to write testing kernel data" );
+    error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0,
+                                 sizeof(cl_float) * 1024 * 32,
+                                 (void *)writeArray, 0, NULL, &events[1]);
+    test_error(error, "Unable to write testing kernel data");
     /* Both should still be running */
-    error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE)
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED
+        && status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array read (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "during array read (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE)
+    error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED
+        && status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array write (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "during array write (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
     /* Now try waiting for both */
-    error = clWaitForEvents( 2, events );
-    test_error( error, "Unable to wait for array events" );
+    error = clWaitForEvents(2, events);
+    test_error(error, "Unable to wait for array events");
     /* Double check status on both */
-    error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array read complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after array read complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array write complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after array write complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    clReleaseMemObject( streams[0] );
-    clReleaseMemObject( streams[1] );
-    clReleaseEvent( events[0] );
-    clReleaseEvent( events[1] );
+    clReleaseMemObject(streams[0]);
+    clReleaseMemObject(streams[1]);
+    clReleaseEvent(events[0]);
+    clReleaseEvent(events[1]);
     return 0;
-int test_event_flush( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_flush(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
     int loopCount = 0;
     cl_int status;
-    SETUP_EVENT( context, queue );
+    SETUP_EVENT(context, queue);
-    /* Now flush. Note that we can't guarantee this actually lets the op finish, but we can guarantee it's no longer queued */
-    error = clFlush( queue );
-    test_error( error, "Unable to flush events" );
+    /* Now flush. Note that we can't guarantee this actually lets the op finish,
+     * but we can guarantee it's no longer queued */
+    error = clFlush(queue);
+    test_error(error, "Unable to flush events");
     /* Make sure it worked */
-         while (1) {
-        error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS,
-                                                                sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
+    while (1)
+    {
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(status), &status, NULL);
+        test_error(error, "Calling clGetEventStatus didn't work!");
-        if( status != CL_QUEUED )
-                  break;
+        if (status != CL_QUEUED) break;
-#if ! defined( _WIN32 )
+#if !defined(_WIN32)
         sleep(1); // give it some time here.
 #else // _WIN32
-            Sleep(1000);
+        Sleep(1000);
-          }
+    }
-CL_QUEUED (command has been enqueued in the command-queue),
-CL_SUBMITTED (enqueued command has been submitted by the host to the device associated with the command-queue),
-CL_RUNNING (device is currently executing this command),
-CL_COMPLETE (the command has completed), or
-Error code given by a negative integer value. (command was abnormally terminated – this may be caused by a bad memory access etc.).
-     if(status != CL_COMPLETE && status != CL_SUBMITTED &&
-        status != CL_RUNNING && status != CL_COMPLETE)
+    /*
+    CL_QUEUED (command has been enqueued in the command-queue),
+    CL_SUBMITTED (enqueued command has been submitted by the host to the device
+    associated with the command-queue), CL_RUNNING (device is currently
+    executing this command), CL_COMPLETE (the command has completed), or Error
+    code given by a negative integer value. (command was abnormally terminated –
+    this may be caused by a bad memory access etc.).
+    */
+    if (status != CL_COMPLETE && status != CL_SUBMITTED && status != CL_RUNNING
+        && status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event flush (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after event flush (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
     /* Now wait */
-    error = clFinish( queue );
-    test_error( error, "Unable to finish events" );
+    error = clFinish(queue);
+    test_error(error, "Unable to finish events");
     return 0;
-int test_event_finish_execute( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_finish_execute(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
     cl_int status;
-    SETUP_EVENT( context, queue );
+    SETUP_EVENT(context, queue);
     /* Now flush and finish all ops */
-    error = clFinish( queue );
-    test_error( error, "Unable to finish all events" );
+    error = clFinish(queue);
+    test_error(error, "Unable to finish all events");
     /* Make sure it worked */
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after event complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
@@ -417,11 +500,12 @@
     return 0;
-int test_event_finish_array( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_finish_array(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
     cl_mem streams[2];
-    cl_float readArray[ 1024 * 32 ];
-    cl_float writeArray[ 1024 * 32 ];
+    cl_float readArray[1024 * 32];
+    cl_float writeArray[1024 * 32];
     cl_event events[2];
     int error;
     cl_int status;
@@ -429,59 +513,77 @@
     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_float) * 1024 * 32, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_float) * 1024 * 32, NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
-    error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)readArray, 0, NULL, &events[0]);
-    test_error( error, "Unable to read testing kernel data" );
+    error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0,
+                                sizeof(cl_float) * 1024 * 32, (void *)readArray,
+                                0, NULL, &events[0]);
+    test_error(error, "Unable to read testing kernel data");
-    error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)writeArray, 0, NULL, &events[1]);
-    test_error( error, "Unable to write testing kernel data" );
+    error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0,
+                                 sizeof(cl_float) * 1024 * 32,
+                                 (void *)writeArray, 0, NULL, &events[1]);
+    test_error(error, "Unable to write testing kernel data");
     /* Both should still be running */
-    error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE)
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED
+        && status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array read (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "during array read (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE)
+    error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED
+        && status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array write (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "during array write (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
     /* Now try finishing all ops */
-    error = clFinish( queue );
-    test_error( error, "Unable to finish all events" );
+    error = clFinish(queue);
+    test_error(error, "Unable to finish all events");
     /* Double check status on both */
-    error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array read complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after array read complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventStatus didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventStatus didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array write complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetErrorStatus "
+                  "after array write complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
-    clReleaseMemObject( streams[0] );
-    clReleaseMemObject( streams[1] );
-    clReleaseEvent( events[0] );
-    clReleaseEvent( events[1] );
+    clReleaseMemObject(streams[0]);
+    clReleaseMemObject(streams[1]);
+    clReleaseEvent(events[0]);
+    clReleaseEvent(events[1]);
     return 0;
@@ -489,7 +591,8 @@
 #define NUM_EVENT_RUNS 100
-int test_event_release_before_done( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_release_before_done(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
     // Create a kernel to run
     clProgramWrapper program;
@@ -501,21 +604,24 @@
     int error, i;
     // Create a kernel
-    if( create_single_kernel_helper( context, &program, &kernel[0], 1, sample_long_test_kernel, "sample_test" ) )
+    if (create_single_kernel_helper(context, &program, &kernel[0], 1,
+                                    sample_long_test_kernel, "sample_test"))
         return -1;
-    for( i = 1; i < NUM_EVENT_RUNS; i++ ) {
-       kernel[i] = clCreateKernel(program, "sample_test", &error);
-    test_error(error, "Unable to create kernel");
-  }
+    for (i = 1; i < NUM_EVENT_RUNS; i++)
+    {
+        kernel[i] = clCreateKernel(program, "sample_test", &error);
+        test_error(error, "Unable to create kernel");
+    }
-    error = get_max_common_work_group_size( context, kernel[0], 1024, &threads[0] );
-    test_error( error, "Unable to get work group size to use" );
+    error =
+        get_max_common_work_group_size(context, kernel[0], 1024, &threads[0]);
+    test_error(error, "Unable to get work group size to use");
     // Create a set of streams to use as arguments
-    for( i = 0; i < NUM_EVENT_RUNS; i++ )
+    for (i = 0; i < NUM_EVENT_RUNS; i++)
         streams[i][0] =
             clCreateBuffer(context, CL_MEM_READ_WRITE,
@@ -523,77 +629,89 @@
         streams[i][1] =
             clCreateBuffer(context, CL_MEM_READ_WRITE,
                            sizeof(cl_int) * threads[0], NULL, &error);
-        if( ( streams[i][0] == NULL ) || ( streams[i][1] == NULL ) )
+        if ((streams[i][0] == NULL) || (streams[i][1] == NULL))
-            log_error( "ERROR: Unable to allocate testing streams" );
+            log_error("ERROR: Unable to allocate testing streams");
             return -1;
-    // Execute the kernels one by one, hopefully making sure they won't be done by the time we get to the end
-    for( i = 0; i < NUM_EVENT_RUNS; i++ )
+    // Execute the kernels one by one, hopefully making sure they won't be done
+    // by the time we get to the end
+    for (i = 0; i < NUM_EVENT_RUNS; i++)
-        error = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), &streams[i][0] );
-        error |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), &streams[i][1] );
-        test_error( error, "Unable to set kernel arguments" );
+        error = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &streams[i][0]);
+        error |= clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &streams[i][1]);
+        test_error(error, "Unable to set kernel arguments");
-        error = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, threads, threads, 0, NULL, &events[i]);
-        test_error( error, "Unable to execute test kernel" );
+        error = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, threads,
+                                       threads, 0, NULL, &events[i]);
+        test_error(error, "Unable to execute test kernel");
     // Free all but the last event
-    for( i = 0; i < NUM_EVENT_RUNS - 1; i++ )
+    for (i = 0; i < NUM_EVENT_RUNS - 1; i++)
-        clReleaseEvent( events[ i ] );
+        clReleaseEvent(events[i]);
     // Get status on the last one, then free it
-    error = clGetEventInfo( events[ NUM_EVENT_RUNS - 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Unable to get event status" );
+    error = clGetEventInfo(events[NUM_EVENT_RUNS - 1],
+                           CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status),
+                           &status, NULL);
+    test_error(error, "Unable to get event status");
-    clReleaseEvent( events[ NUM_EVENT_RUNS - 1 ] );
+    clReleaseEvent(events[NUM_EVENT_RUNS - 1]);
     // Was the status still-running?
-    if( status == CL_COMPLETE )
+    if (status == CL_COMPLETE)
-        log_info( "WARNING: Events completed before they could be released, so test is a null-op. Increase workload and try again." );
+        log_info("WARNING: Events completed before they could be released, so "
+                 "test is a null-op. Increase workload and try again.");
-    else if( status == CL_RUNNING || status == CL_QUEUED || status == CL_SUBMITTED )
+    else if (status == CL_RUNNING || status == CL_QUEUED
+             || status == CL_SUBMITTED)
-        log_info( "Note: Event status was running or queued when released, so test was good.\n" );
+        log_info("Note: Event status was running or queued when released, so "
+                 "test was good.\n");
     // If we didn't crash by now, the test succeeded
-    clFinish( queue );
+    clFinish(queue);
     return 0;
-int test_event_enqueue_marker( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_enqueue_marker(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
     cl_int status;
-    SETUP_EVENT( context, queue );
+    SETUP_EVENT(context, queue);
-    /* Now we queue a marker and wait for that, which--since it queues afterwards--should guarantee the execute finishes too */
+    /* Now we queue a marker and wait for that, which--since it queues
+     * afterwards--should guarantee the execute finishes too */
     clEventWrapper markerEvent;
-    //error = clEnqueueMarker( queue, &markerEvent );
+    // error = clEnqueueMarker( queue, &markerEvent );
 #ifdef CL_VERSION_1_2
-    error = clEnqueueMarkerWithWaitList(queue, 0, NULL, &markerEvent );
+    error = clEnqueueMarkerWithWaitList(queue, 0, NULL, &markerEvent);
-    error = clEnqueueMarker( queue, &markerEvent );
+    error = clEnqueueMarker(queue, &markerEvent);
-       test_error( error, "Unable to queue marker" );
+    test_error(error, "Unable to queue marker");
     /* Now we wait for it to be done, then test the status again */
-    error = clWaitForEvents( 1, &markerEvent );
-    test_error( error, "Unable to wait for marker event" );
+    error = clWaitForEvents(1, &markerEvent);
+    test_error(error, "Unable to wait for marker event");
     /* Check the status of the first event */
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Calling clGetEventInfo didn't work!" );
-    if( status != CL_COMPLETE )
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status), &status, NULL);
+    test_error(error, "Calling clGetEventInfo didn't work!");
+    if (status != CL_COMPLETE)
-        log_error( "ERROR: Incorrect status returned from clGetEventInfo after event complete (%d:%s)\n", status, IGetStatusString( status ) );
+        log_error("ERROR: Incorrect status returned from clGetEventInfo after "
+                  "event complete (%d:%s)\n",
+                  status, IGetStatusString(status));
         return -1;
@@ -602,85 +720,101 @@
 #ifdef CL_VERSION_1_2
-int test_event_enqueue_marker_with_event_list( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_enqueue_marker_with_event_list(cl_device_id deviceID,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements)
+    SETUP_EVENT(context, queue);
+    cl_event event_list[3] = { NULL, NULL, NULL };
-    cl_int status;
-    SETUP_EVENT( context, queue );
-    cl_event event_list[3]={ NULL, NULL, NULL};
+    size_t threads[1] = { 10 }, localThreads[1] = { 1 };
+    cl_uint event_count = 2;
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[0]);
+    test_error(error, " clEnqueueMarkerWithWaitList   1 ");
-    size_t threads[1] = { 10 }, localThreads[1]={1};
-    cl_uint event_count=2;
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]);
-      test_error( error, " clEnqueueMarkerWithWaitList   1 " );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[1]);
+    test_error(error, " clEnqueueMarkerWithWaitList 2");
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]);
-      test_error( error, " clEnqueueMarkerWithWaitList 2" );
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, NULL);
-      test_error( error, " clEnqueueMarkerWithWaitList  3" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, " clEnqueueMarkerWithWaitList  3");
     // test the case event returned
-    error =clEnqueueMarkerWithWaitList(queue, event_count, event_list,  &event_list[2]);
-      test_error( error, " clEnqueueMarkerWithWaitList " );
+    error = clEnqueueMarkerWithWaitList(queue, event_count, event_list,
+                                        &event_list[2]);
+    test_error(error, " clEnqueueMarkerWithWaitList ");
     error = clReleaseEvent(event_list[0]);
     error |= clReleaseEvent(event_list[1]);
-    test_error( error, "clReleaseEvent" );
+    test_error(error, "clReleaseEvent");
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]);
-    test_error( error, " clEnqueueMarkerWithWaitList   1 -1 " );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[0]);
+    test_error(error, " clEnqueueMarkerWithWaitList   1 -1 ");
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]);
-    test_error( error, " clEnqueueMarkerWithWaitList  2-2" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[1]);
+    test_error(error, " clEnqueueMarkerWithWaitList  2-2");
-    // test the case event =NULL,   caused [CL_INVALID_VALUE] : OpenCL Error : clEnqueueMarkerWithWaitList failed: event is a NULL value
-    error =clEnqueueMarkerWithWaitList(queue, event_count, event_list,  NULL);
-    test_error( error, " clEnqueueMarkerWithWaitList " );
+    // test the case event =NULL,   caused [CL_INVALID_VALUE] : OpenCL Error :
+    // clEnqueueMarkerWithWaitList failed: event is a NULL value
+    error = clEnqueueMarkerWithWaitList(queue, event_count, event_list, NULL);
+    test_error(error, " clEnqueueMarkerWithWaitList ");
     error = clReleaseEvent(event_list[0]);
     error |= clReleaseEvent(event_list[1]);
     error |= clReleaseEvent(event_list[2]);
-    test_error( error, "clReleaseEvent" );
+    test_error(error, "clReleaseEvent");
     return 0;
-int test_event_enqueue_barrier_with_event_list( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_event_enqueue_barrier_with_event_list(cl_device_id deviceID,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements)
+    SETUP_EVENT(context, queue);
+    cl_event event_list[3] = { NULL, NULL, NULL };
-    cl_int status;
-    SETUP_EVENT( context, queue );
-    cl_event event_list[3]={ NULL, NULL, NULL};
+    size_t threads[1] = { 10 }, localThreads[1] = { 1 };
+    cl_uint event_count = 2;
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[0]);
+    test_error(error, " clEnqueueBarrierWithWaitList   1 ");
-    size_t threads[1] = { 10 }, localThreads[1]={1};
-    cl_uint event_count=2;
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]);
-    test_error( error, " clEnqueueBarrierWithWaitList   1 " );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[1]);
+    test_error(error, " clEnqueueBarrierWithWaitList 2");
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]);
-    test_error( error, " clEnqueueBarrierWithWaitList 2" );
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, NULL);
-    test_error( error, " clEnqueueBarrierWithWaitList  20" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, " clEnqueueBarrierWithWaitList  20");
     // test the case event returned
-    error =clEnqueueBarrierWithWaitList(queue, event_count, event_list,  &event_list[2]);
-    test_error( error, " clEnqueueBarrierWithWaitList " );
+    error = clEnqueueBarrierWithWaitList(queue, event_count, event_list,
+                                         &event_list[2]);
+    test_error(error, " clEnqueueBarrierWithWaitList ");
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]);
-    test_error( error, " clEnqueueBarrierWithWaitList   1 " );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[0]);
+    test_error(error, " clEnqueueBarrierWithWaitList   1 ");
-    error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]);
-    test_error( error, " clEnqueueBarrierWithWaitList 2" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event_list[1]);
+    test_error(error, " clEnqueueBarrierWithWaitList 2");
-    // test the case event =NULL,   caused [CL_INVALID_VALUE] : OpenCL Error : clEnqueueMarkerWithWaitList failed: event is a NULL value
-    error = clEnqueueBarrierWithWaitList(queue, event_count, event_list,  NULL);
-    test_error( error, " clEnqueueBarrierWithWaitList " );
+    // test the case event =NULL,   caused [CL_INVALID_VALUE] : OpenCL Error :
+    // clEnqueueMarkerWithWaitList failed: event is a NULL value
+    error = clEnqueueBarrierWithWaitList(queue, event_count, event_list, NULL);
+    test_error(error, " clEnqueueBarrierWithWaitList ");
diff --git a/test_conformance/events/test_userevents.cpp b/test_conformance/events/test_userevents.cpp
index 0a4954f..1fdb4ea 100644
--- a/test_conformance/events/test_userevents.cpp
+++ b/test_conformance/events/test_userevents.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -14,11 +14,11 @@
 // limitations under the License.
 #if defined(__APPLE__)
-    #include <OpenCL/opencl.h>
-    #include <mach/mach_time.h>
+#include <OpenCL/opencl.h>
+#include <mach/mach_time.h>
-    #include <CL/cl.h>
-  #include <malloc.h>
+#include <CL/cl.h>
+#include <malloc.h>
 #include <assert.h>
 #include <stdio.h>
@@ -29,189 +29,261 @@
 // CL error checking.
 #if defined(_MSC_VER)
-#define CL_EXIT_ERROR(cmd,...) \
-{ \
-if ((cmd) != CL_SUCCESS) { \
-log_error("CL ERROR: %s %u: ", __FILE__,__LINE__);\
-log_error(## __VA_ARGS__ );\
-return -1;\
+#define CL_EXIT_ERROR(cmd, ...)                                                \
+    {                                                                          \
+        if ((cmd) != CL_SUCCESS)                                               \
+        {                                                                      \
+            log_error("CL ERROR: %s %u: ", __FILE__, __LINE__);                \
+            log_error(##__VA_ARGS__);                                          \
+            log_error("\n");                                                   \
+            return -1;                                                         \
+        }                                                                      \
+    }
-#define CL_EXIT_ERROR(cmd,format,...) \
-{ \
-if ((cmd) != CL_SUCCESS) { \
-log_error("CL ERROR: %s %u: ", __FILE__,__LINE__);\
-log_error(format,## __VA_ARGS__ );\
-return -1;\
+#define CL_EXIT_ERROR(cmd, format, ...)                                        \
+    {                                                                          \
+        if ((cmd) != CL_SUCCESS)                                               \
+        {                                                                      \
+            log_error("CL ERROR: %s %u: ", __FILE__, __LINE__);                \
+            log_error(format, ##__VA_ARGS__);                                  \
+            log_error("\n");                                                   \
+            return -1;                                                         \
+        }                                                                      \
+    }
-#define CL_EXIT_BUILD_ERROR(cmd,program,format,...) \
-{ \
-if ((cmd) != CL_SUCCESS) { \
-cl_uint num_devices_;\
-cl_device_id *device_list;\
-device_list=(cl_device_id *)malloc(num_devices_*sizeof(cl_device_id));\
-for (unsigned i=0;i<num_devices_;++i) {\
-size_t len;\
-char buffer[2048];\
-log_error("DEVICE %u CL BUILD ERROR: %s(%u): ",i,__FILE__,__LINE__);\
-log_error(format,## __VA_ARGS__ );\
-return -1;\
+#define CL_EXIT_BUILD_ERROR(cmd, program, format, ...)                         \
+    {                                                                          \
+        if ((cmd) != CL_SUCCESS)                                               \
+        {                                                                      \
+            cl_uint num_devices_;                                              \
+            clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,                  \
+                             sizeof(num_devices_), &num_devices_, NULL);       \
+            cl_device_id *device_list;                                         \
+            device_list =                                                      \
+                (cl_device_id *)malloc(num_devices_ * sizeof(cl_device_id));   \
+            clGetProgramInfo(program, CL_PROGRAM_DEVICES,                      \
+                             num_devices_ * sizeof(cl_device_id), device_list, \
+                             NULL);                                            \
+            for (unsigned i = 0; i < num_devices_; ++i)                        \
+            {                                                                  \
+                size_t len;                                                    \
+                char buffer[2048];                                             \
+                clGetProgramBuildInfo(program, device_list[i],                 \
+                                      CL_PROGRAM_BUILD_LOG, sizeof(buffer),    \
+                                      buffer, &len);                           \
+                log_error("DEVICE %u CL BUILD ERROR: %s(%u): ", i, __FILE__,   \
+                          __LINE__);                                           \
+                log_error(format, ##__VA_ARGS__);                              \
+                log_error("\n");                                               \
+            }                                                                  \
+            free(device_list);                                                 \
+            return -1;                                                         \
+        }                                                                      \
+    }
-const char* src[] = {
-  "__kernel void simple_task(__global float* output) {\n"
-  "  output[0] += 1;\n"
-  "}\n"
+const char *src[] = { "__kernel void simple_task(__global float* output) {\n"
+                      "  output[0] += 1;\n"
+                      "}\n" };
+    MaxDevices = 8
-enum { MaxDevices = 8 };
-int test_userevents( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_userevents(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
-  cl_int err;
+    cl_int err;
-  cl_event u1 = clCreateUserEvent( context, &err );
-  CL_EXIT_ERROR(err,"clCreateUserEvent failed");
+    cl_event u1 = clCreateUserEvent(context, &err);
+    CL_EXIT_ERROR(err, "clCreateUserEvent failed");
-  // Test event properties.
-  cl_int s;
-  size_t sizeofs;
-  CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof s, &s, &sizeofs),"clGetEventInfo failed");
-  CL_EXIT_ERROR((sizeof s == sizeofs) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_COMMAND_EXECUTION_STATUS");
-  CL_EXIT_ERROR((s == CL_SUBMITTED) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_COMMAND_EXECUTION_STATUS");
+    // Test event properties.
+    cl_int s;
+    size_t sizeofs;
+                                 sizeof s, &s, &sizeofs),
+                  "clGetEventInfo failed");
+    CL_EXIT_ERROR((sizeof s == sizeofs) ? CL_SUCCESS : -1,
+                  "clGetEventInfo returned wrong size for "
+                  "clGetEventInfo returned wrong value for "
-  cl_command_type t;
-  size_t sizeoft;
-  CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_TYPE, sizeof t, &t, &sizeoft),"clGetEventInfo failed");
-  CL_EXIT_ERROR((sizeof t == sizeoft) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_COMMAND_TYPE");
-  CL_EXIT_ERROR((t == CL_COMMAND_USER) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_COMMAND_TYPE");
+    cl_command_type t;
+    size_t sizeoft;
+        clGetEventInfo(u1, CL_EVENT_COMMAND_TYPE, sizeof t, &t, &sizeoft),
+        "clGetEventInfo failed");
+        (sizeof t == sizeoft) ? CL_SUCCESS : -1,
+        "clGetEventInfo returned wrong size for CL_EVENT_COMMAND_TYPE");
+        (t == CL_COMMAND_USER) ? CL_SUCCESS : -1,
+        "clGetEventInfo returned wrong value for CL_EVENT_COMMAND_TYPE");
-  cl_command_queue q;
-  size_t sizeofq;
-  CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_QUEUE, sizeof q, &q, &sizeofq),"clGetEventInfo failed");
-  CL_EXIT_ERROR((sizeof q == sizeofq) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_COMMAND_QUEUE");
-  CL_EXIT_ERROR((q == NULL) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_COMMAND_QUEUE");
+    cl_command_queue q;
+    size_t sizeofq;
+        clGetEventInfo(u1, CL_EVENT_COMMAND_QUEUE, sizeof q, &q, &sizeofq),
+        "clGetEventInfo failed");
+        (sizeof q == sizeofq) ? CL_SUCCESS : -1,
+        "clGetEventInfo returned wrong size for CL_EVENT_COMMAND_QUEUE");
+        (q == NULL) ? CL_SUCCESS : -1,
+        "clGetEventInfo returned wrong value for CL_EVENT_COMMAND_QUEUE");
-  cl_context c;
-  size_t sizeofc;
-  CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_CONTEXT, sizeof c, &c, &sizeofc),"clGetEventInfo failed");
-  CL_EXIT_ERROR((sizeof c == sizeofc) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_CONTEXT");
-  CL_EXIT_ERROR((c == context) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_CONTEXT");
+    cl_context c;
+    size_t sizeofc;
+    CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_CONTEXT, sizeof c, &c, &sizeofc),
+                  "clGetEventInfo failed");
+    CL_EXIT_ERROR((sizeof c == sizeofc) ? CL_SUCCESS : -1,
+                  "clGetEventInfo returned wrong size for CL_EVENT_CONTEXT");
+    CL_EXIT_ERROR((c == context) ? CL_SUCCESS : -1,
+                  "clGetEventInfo returned wrong value for CL_EVENT_CONTEXT");
-  cl_ulong p;
-  err = clGetEventProfilingInfo(u1,CL_PROFILING_COMMAND_QUEUED,sizeof p,&p,0);
-  CL_EXIT_ERROR((err != CL_SUCCESS) ? CL_SUCCESS : -1,"clGetEventProfilingInfo returned wrong error.");
+    cl_ulong p;
+    err = clGetEventProfilingInfo(u1, CL_PROFILING_COMMAND_QUEUED, sizeof p, &p,
+                                  0);
+                  "clGetEventProfilingInfo returned wrong error.");
-  // Test semantics.
-  cl_program program;
-  err = create_single_kernel_helper_create_program(context, &program, 1, src);
-  CL_EXIT_ERROR(err,"clCreateProgramWithSource failed");
+    // Test semantics.
+    cl_program program;
+    err = create_single_kernel_helper_create_program(context, &program, 1, src);
+    CL_EXIT_ERROR(err, "clCreateProgramWithSource failed");
-  CL_EXIT_BUILD_ERROR(clBuildProgram(program,0,NULL,"",NULL,NULL),program,"Building program from inline src:\t%s",src[0]);
+    CL_EXIT_BUILD_ERROR(clBuildProgram(program, 0, NULL, "", NULL, NULL),
+                        program, "Building program from inline src:\t%s",
+                        src[0]);
-  cl_kernel k0 = clCreateKernel(program,"simple_task",&err);
-  CL_EXIT_ERROR(err,"clCreateKernel failed");
+    cl_kernel k0 = clCreateKernel(program, "simple_task", &err);
+    CL_EXIT_ERROR(err, "clCreateKernel failed");
-  float buffer[1];
-  cl_mem output = clCreateBuffer(context,CL_MEM_USE_HOST_PTR,sizeof buffer, buffer, &err);
-  CL_EXIT_ERROR(err,"clCreateBuffer failed.");
+    float buffer[1];
+    cl_mem output = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof buffer,
+                                   buffer, &err);
+    CL_EXIT_ERROR(err, "clCreateBuffer failed.");
-  CL_EXIT_ERROR(clSetKernelArg(k0,0,sizeof(output),&output),"clSetKernelArg failed");
+    CL_EXIT_ERROR(clSetKernelArg(k0, 0, sizeof(output), &output),
+                  "clSetKernelArg failed");
-  // Successful case. //////////////////////////////////////////////////////////////////////////////////////
-  {
-    cl_event e[4];
-    cl_uint  N = sizeof e / sizeof(cl_event);
+    // Successful case.
+    // //////////////////////////////////////////////////////////////////////////////////////
+    {
+        cl_event e[4];
+        cl_uint N = sizeof e / sizeof(cl_event);
-    log_info("Enqueuing tasks\n");
-    for (cl_uint i = 0; i != N; ++i)
-      CL_EXIT_ERROR(clEnqueueTask(queue,k0,1,&u1,&e[i]),"clEnqueueTaskFailed");
+        log_info("Enqueuing tasks\n");
+        for (cl_uint i = 0; i != N; ++i)
+            CL_EXIT_ERROR(clEnqueueTask(queue, k0, 1, &u1, &e[i]),
+                          "clEnqueueTaskFailed");
-    log_info("Checking task status before setting user event status\n");
-    for (cl_uint i = 0; i != N; ++i) {
-      CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed");
-      CL_EXIT_ERROR((s >= CL_SUBMITTED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status before user event",i);
+        log_info("Checking task status before setting user event status\n");
+        for (cl_uint i = 0; i != N; ++i)
+        {
+            CL_EXIT_ERROR(clGetEventInfo(e[i],
+                                         CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                         sizeof s, &s, 0),
+                          "clGetEventInfo failed");
+            CL_EXIT_ERROR(
+                (s >= CL_SUBMITTED) ? CL_SUCCESS : -1,
+                "clGetEventInfo %u returned wrong status before user event", i);
+        }
+        log_info("Setting user event status to complete\n");
+        CL_EXIT_ERROR(clSetUserEventStatus(u1, CL_COMPLETE),
+                      "clSetUserEventStatus failed");
+        log_info("Waiting for tasks to finish executing\n");
+        CL_EXIT_ERROR(clWaitForEvents(1, &e[N - 1]), "clWaitForEvent failed");
+        log_info("Checking task status after setting user event status\n");
+        for (cl_uint i = 0; i != N; ++i)
+        {
+            CL_EXIT_ERROR(clGetEventInfo(e[i],
+                                         CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                         sizeof s, &s, 0),
+                          "clGetEventInfo failed");
+            CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1,
+                          "clGetEventInfo %u returned wrong status %04x after "
+                          "successful user event",
+                          i, s);
+        }
+        CL_EXIT_ERROR(clReleaseEvent(u1), "clReleaseEvent failed");
+        for (cl_uint i = 0; i != N; ++i)
+            CL_EXIT_ERROR(clReleaseEvent(e[i]), "clReleaseEvent failed");
+        log_info("Successful user event case passed.\n");
-    log_info("Setting user event status to complete\n");
-    CL_EXIT_ERROR(clSetUserEventStatus(u1,CL_COMPLETE),"clSetUserEventStatus failed");
+    // Test unsuccessful user event case.
+    // ///////////////////////////////////////////////////////////////////
+    {
+        cl_event u2 = clCreateUserEvent(context, &err);
+        CL_EXIT_ERROR(err, "clCreateUserEvent failed");
-    log_info("Waiting for tasks to finish executing\n");
-    CL_EXIT_ERROR(clWaitForEvents( 1, &e[N-1] ),"clWaitForEvent failed");
+        cl_event e[4];
+        cl_uint N = sizeof e / sizeof(cl_event);
-    log_info("Checking task status after setting user event status\n");
-    for (cl_uint i = 0; i != N; ++i) {
-      CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed");
-      CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status %04x after successful user event",i,s);
+        log_info("Enqueuing tasks\n");
+        for (cl_uint i = 0; i != N; ++i)
+            CL_EXIT_ERROR(clEnqueueTask(queue, k0, 1, &u2, &e[i]),
+                          "clEnqueueTaskFailed");
+        log_info("Checking task status before setting user event status\n");
+        for (cl_uint i = 0; i != N; ++i)
+        {
+            CL_EXIT_ERROR(clGetEventInfo(e[i],
+                                         CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                         sizeof s, &s, 0),
+                          "clGetEventInfo failed");
+            CL_EXIT_ERROR(
+                (s == CL_QUEUED || s == CL_SUBMITTED) ? CL_SUCCESS : -1,
+                "clGetEventInfo %u returned wrong status %d before user event",
+                i, (int)s);
+        }
+        log_info("Setting user event status to unsuccessful result\n");
+        CL_EXIT_ERROR(clSetUserEventStatus(u2, -1),
+                      "clSetUserEventStatus failed");
+        log_info("Waiting for tasks to finish executing\n");
+        CL_EXIT_ERROR((clWaitForEvents(N, &e[0]) != CL_SUCCESS) ? CL_SUCCESS
+                                                                : -1,
+                      "clWaitForEvent succeeded when it should have failed");
+        log_info("Checking task status after setting user event status\n");
+        for (cl_uint i = 0; i != N; ++i)
+        {
+            CL_EXIT_ERROR(clGetEventInfo(e[i],
+                                         CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                         sizeof s, &s, 0),
+                          "clGetEventInfo failed");
+            CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1,
+                          "clGetEventInfo %u returned wrong status %04x after "
+                          "unsuccessful user event",
+                          i, s);
+        }
+        CL_EXIT_ERROR(clReleaseEvent(u2), "clReleaseEvent failed");
+        for (cl_uint i = 0; i != N; ++i)
+            CL_EXIT_ERROR(clReleaseEvent(e[i]), "clReleaseEvent failed");
+        log_info("Unsuccessful user event case passed.\n");
-    CL_EXIT_ERROR(clReleaseEvent(u1),"clReleaseEvent failed");
+    clReleaseKernel(k0);
+    clReleaseProgram(program);
+    clReleaseMemObject(output);
-    for (cl_uint i = 0; i != N; ++i)
-      CL_EXIT_ERROR(clReleaseEvent(e[i]),"clReleaseEvent failed");
-    log_info("Successful user event case passed.\n");
-  }
-  // Test unsuccessful user event case. ///////////////////////////////////////////////////////////////////
-  {
-    cl_event u2 = clCreateUserEvent( context, &err );
-    CL_EXIT_ERROR(err,"clCreateUserEvent failed");
-    cl_event e[4];
-    cl_uint  N = sizeof e / sizeof(cl_event);
-    log_info("Enqueuing tasks\n");
-    for (cl_uint i = 0; i != N; ++i)
-      CL_EXIT_ERROR(clEnqueueTask(queue,k0,1,&u2,&e[i]),"clEnqueueTaskFailed");
-    log_info("Checking task status before setting user event status\n");
-    for (cl_uint i = 0; i != N; ++i) {
-      CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed");
-      CL_EXIT_ERROR((s == CL_QUEUED || s == CL_SUBMITTED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status %d before user event",i, (int) s);
-    }
-    log_info("Setting user event status to unsuccessful result\n");
-    CL_EXIT_ERROR(clSetUserEventStatus(u2,-1),"clSetUserEventStatus failed");
-    log_info("Waiting for tasks to finish executing\n");
-    CL_EXIT_ERROR((clWaitForEvents( N, &e[0] )!=CL_SUCCESS) ? CL_SUCCESS : -1,"clWaitForEvent succeeded when it should have failed");
-    log_info("Checking task status after setting user event status\n");
-    for (cl_uint i = 0; i != N; ++i) {
-      CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed");
-      CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status %04x after unsuccessful user event",i,s);
-    }
-    CL_EXIT_ERROR(clReleaseEvent(u2),"clReleaseEvent failed");
-    for (cl_uint i = 0; i != N; ++i)
-      CL_EXIT_ERROR(clReleaseEvent(e[i]),"clReleaseEvent failed");
-    log_info("Unsuccessful user event case passed.\n");
-  }
-  clReleaseKernel(k0);
-  clReleaseProgram(program);
-  clReleaseMemObject(output);
-  return 0;
+    return 0;
diff --git a/test_conformance/events/test_userevents_multithreaded.cpp b/test_conformance/events/test_userevents_multithreaded.cpp
index 51ef222..a7845bf 100644
--- a/test_conformance/events/test_userevents_multithreaded.cpp
+++ b/test_conformance/events/test_userevents_multithreaded.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -19,8 +19,8 @@
 #include <thread>
-#if !defined (_MSC_VER)
-    #include <unistd.h>
+#if !defined(_MSC_VER)
+#include <unistd.h>
 #endif // !_MSC_VER
 void trigger_user_event(cl_event *event)
@@ -30,44 +30,44 @@
     clSetUserEventStatus(*event, CL_COMPLETE);
-int test_userevents_multithreaded( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_userevents_multithreaded(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
     cl_int error;
     // Set up a user event to act as a gate
-    clEventWrapper gateEvent = clCreateUserEvent( context, &error );
-    test_error( error, "Unable to create user gate event" );
+    clEventWrapper gateEvent = clCreateUserEvent(context, &error);
+    test_error(error, "Unable to create user gate event");
     // Set up a few actions gated on the user event
     NDRangeKernelAction action1;
     ReadBufferAction action2;
     WriteBufferAction action3;
-    clEventWrapper actionEvents[ 3 ];
-    Action * actions[] = { &action1, &action2, &action3, NULL };
+    clEventWrapper actionEvents[3];
+    Action *actions[] = { &action1, &action2, &action3, NULL };
-    for( int i = 0; actions[ i ] != NULL; i++ )
+    for (int i = 0; actions[i] != NULL; i++)
-        error = actions[ i ]->Setup( deviceID, context, queue );
-        test_error( error, "Unable to set up test action" );
+        error = actions[i]->Setup(deviceID, context, queue);
+        test_error(error, "Unable to set up test action");
-        error = actions[ i ]->Execute( queue, 1, &gateEvent, &actionEvents[ i ] );
-        test_error( error, "Unable to execute test action" );
+        error = actions[i]->Execute(queue, 1, &gateEvent, &actionEvents[i]);
+        test_error(error, "Unable to execute test action");
     // Now, instead of releasing the gate, we spawn a separate thread to do so
-    log_info( "\tStarting trigger thread...\n" );
+    log_info("\tStarting trigger thread...\n");
     std::thread thread(trigger_user_event, &gateEvent);
-    log_info( "\tWaiting for actions...\n" );
-    error = clWaitForEvents( 3, &actionEvents[ 0 ] );
-    test_error( error, "Unable to wait for action events" );
+    log_info("\tWaiting for actions...\n");
+    error = clWaitForEvents(3, &actionEvents[0]);
+    test_error(error, "Unable to wait for action events");
-    log_info( "\tActions completed.\n" );
+    log_info("\tActions completed.\n");
     // If we got here without error, we're good
     return 0;
diff --git a/test_conformance/events/test_waitlists.cpp b/test_conformance/events/test_waitlists.cpp
index e23cacf..6036451 100644
--- a/test_conformance/events/test_waitlists.cpp
+++ b/test_conformance/events/test_waitlists.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,306 +17,374 @@
 #include "action_classes.h"
-extern const char *IGetStatusString( cl_int status );
+extern const char *IGetStatusString(cl_int status);
 #define PRINT_OPS 0
-int test_waitlist( cl_device_id device, cl_context context, cl_command_queue queue, Action *actionToTest, bool multiple )
+int test_waitlist(cl_device_id device, cl_context context,
+                  cl_command_queue queue, Action *actionToTest, bool multiple)
-    NDRangeKernelAction    actions[ 2 ];
-    clEventWrapper events[ 3 ];
-    cl_int status[ 3 ];
+    NDRangeKernelAction actions[2];
+    clEventWrapper events[3];
+    cl_int status[3];
     cl_int error;
-  if (multiple)
-    log_info("\tExecuting reference event 0, then reference event 1 with reference event 0 in its waitlist, then test event 2 with reference events 0 and 1 in its waitlist.\n");
-  else
-    log_info("\tExecuting reference event 0, then test event 2 with reference event 0 in its waitlist.\n");
+    if (multiple)
+        log_info("\tExecuting reference event 0, then reference event 1 with "
+                 "reference event 0 in its waitlist, then test event 2 with "
+                 "reference events 0 and 1 in its waitlist.\n");
+    else
+        log_info("\tExecuting reference event 0, then test event 2 with "
+                 "reference event 0 in its waitlist.\n");
     // Set up the first base action to wait against
-    error = actions[ 0 ].Setup( device, context, queue );
-    test_error( error, "Unable to setup base event to wait against" );
+    error = actions[0].Setup(device, context, queue);
+    test_error(error, "Unable to setup base event to wait against");
-    if( multiple )
+    if (multiple)
         // Set up a second event to wait against
-        error = actions[ 1 ].Setup( device, context, queue );
-        test_error( error, "Unable to setup second base event to wait against" );
+        error = actions[1].Setup(device, context, queue);
+        test_error(error, "Unable to setup second base event to wait against");
     // Now set up the actual action to test
-    error = actionToTest->Setup( device, context, queue );
-    test_error( error, "Unable to set up test event" );
+    error = actionToTest->Setup(device, context, queue);
+    test_error(error, "Unable to set up test event");
     // Execute all events now
-  if (PRINT_OPS) log_info("\tExecuting action 0...\n");
-    error = actions[ 0 ].Execute( queue, 0, NULL, &events[ 0 ] );
-    test_error( error, "Unable to execute first event" );
+    if (PRINT_OPS) log_info("\tExecuting action 0...\n");
+    error = actions[0].Execute(queue, 0, NULL, &events[0]);
+    test_error(error, "Unable to execute first event");
-    if( multiple )
+    if (multiple)
-    if (PRINT_OPS) log_info("\tExecuting action 1...\n");
-        error = actions[ 1 ].Execute( queue, 1, &events[0], &events[ 1 ] );
-        test_error( error, "Unable to execute second event" );
+        if (PRINT_OPS) log_info("\tExecuting action 1...\n");
+        error = actions[1].Execute(queue, 1, &events[0], &events[1]);
+        test_error(error, "Unable to execute second event");
     // Sanity check
-  if( multiple ) {
-    if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
-        error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL );
-    test_error( error, "Unable to get event status" );
-  }
-  if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
-    error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL );
-  test_error( error, "Unable to get event status" );
-  log_info("\t\tEvent status after starting reference events: reference event 0: %s, reference event 1: %s, test event 2: %s.\n",
-           IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), "N/A");
-    if( ( status[ 0 ] == CL_COMPLETE ) || ( multiple && status[ 1 ] == CL_COMPLETE ) )
+    if (multiple)
-        log_info( "WARNING: Reference event(s) already completed before we could execute test event! Possible that the reference event blocked (implicitly passing)\n" );
+        if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
+        error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(status[1]), &status[1], NULL);
+        test_error(error, "Unable to get event status");
+    }
+    if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status[0]), &status[0], NULL);
+    test_error(error, "Unable to get event status");
+    log_info("\t\tEvent status after starting reference events: reference "
+             "event 0: %s, reference event 1: %s, test event 2: %s.\n",
+             IGetStatusString(status[0]),
+             (multiple ? IGetStatusString(status[1]) : "N/A"), "N/A");
+    if ((status[0] == CL_COMPLETE) || (multiple && status[1] == CL_COMPLETE))
+    {
+        log_info("WARNING: Reference event(s) already completed before we "
+                 "could execute test event! Possible that the reference event "
+                 "blocked (implicitly passing)\n");
         return 0;
-  if (PRINT_OPS) log_info("\tExecuting action to test...\n");
-    error = actionToTest->Execute( queue, ( multiple ) ? 2 : 1, &events[ 0 ], &events[ 2 ] );
-    test_error( error, "Unable to execute test event" );
+    if (PRINT_OPS) log_info("\tExecuting action to test...\n");
+    error = actionToTest->Execute(queue, (multiple) ? 2 : 1, &events[0],
+                                  &events[2]);
+    test_error(error, "Unable to execute test event");
     // Hopefully, the first event is still running
-  if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n");
-    error = clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL );
-    test_error( error, "Unable to get event status" );
-  if( multiple ) {
-    if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
-        error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL );
-    test_error( error, "Unable to get event status" );
-  }
-  if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
-    error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL );
-  test_error( error, "Unable to get event status" );
-  log_info("\t\tEvent status after starting test event: reference event 0: %s, reference event 1: %s, test event 2: %s.\n",
-           IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] ));
-    if( multiple )
+    if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n");
+    error = clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status[2]), &status[2], NULL);
+    test_error(error, "Unable to get event status");
+    if (multiple)
-        if( status[ 0 ] == CL_COMPLETE && status[ 1 ] == CL_COMPLETE )
+        if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
+        error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(status[1]), &status[1], NULL);
+        test_error(error, "Unable to get event status");
+    }
+    if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status[0]), &status[0], NULL);
+    test_error(error, "Unable to get event status");
+    log_info("\t\tEvent status after starting test event: reference event 0: "
+             "%s, reference event 1: %s, test event 2: %s.\n",
+             IGetStatusString(status[0]),
+             (multiple ? IGetStatusString(status[1]) : "N/A"),
+             IGetStatusString(status[2]));
+    if (multiple)
+    {
+        if (status[0] == CL_COMPLETE && status[1] == CL_COMPLETE)
-            log_info( "WARNING: Both events completed, so unable to test further (implicitly passing).\n" );
-            clFinish( queue );
+            log_info("WARNING: Both events completed, so unable to test "
+                     "further (implicitly passing).\n");
+            clFinish(queue);
             return 0;
-    if(status[1] == CL_COMPLETE && status[0] != CL_COMPLETE)
-    {
-      log_error("ERROR: Test failed because the second wait event is complete and the first is not.(status: 0: %s and 1: %s)\n", IGetStatusString( status[ 0 ] ), IGetStatusString( status[ 1 ] ) );
-            clFinish( queue );
+        if (status[1] == CL_COMPLETE && status[0] != CL_COMPLETE)
+        {
+            log_error(
+                "ERROR: Test failed because the second wait event is complete "
+                "and the first is not.(status: 0: %s and 1: %s)\n",
+                IGetStatusString(status[0]), IGetStatusString(status[1]));
+            clFinish(queue);
             return -1;
-    }
+        }
-        if( status[ 0 ] == CL_COMPLETE )
+        if (status[0] == CL_COMPLETE)
-            log_info( "WARNING: Reference event completed, so unable to test further (implicitly passing).\n" );
-            clFinish( queue );
+            log_info("WARNING: Reference event completed, so unable to test "
+                     "further (implicitly passing).\n");
+            clFinish(queue);
             return 0;
-        if( status[ 0 ] != CL_RUNNING && status[ 0 ] != CL_QUEUED && status[ 0 ] != CL_SUBMITTED )
+        if (status[0] != CL_RUNNING && status[0] != CL_QUEUED
+            && status[0] != CL_SUBMITTED)
-            log_error( "ERROR: Test failed because first wait event is not currently running, queued, or submitted! (status: 0: %s)\n", IGetStatusString( status[ 0 ] ) );
-            clFinish( queue );
+            log_error(
+                "ERROR: Test failed because first wait event is not currently "
+                "running, queued, or submitted! (status: 0: %s)\n",
+                IGetStatusString(status[0]));
+            clFinish(queue);
             return -1;
-    if( status[ 2 ] != CL_QUEUED && status[ 2 ] != CL_SUBMITTED )
+    if (status[2] != CL_QUEUED && status[2] != CL_SUBMITTED)
-        log_error( "ERROR: Test event is not waiting to run! (status: 2: %s)\n", IGetStatusString( status[ 2 ] ) );
-        clFinish( queue );
+        log_error("ERROR: Test event is not waiting to run! (status: 2: %s)\n",
+                  IGetStatusString(status[2]));
+        clFinish(queue);
         return -1;
     // Now wait for the first reference event
-  if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n");
-    error = clWaitForEvents( 1, &events[ 0 ] );
-    test_error( error, "Unable to wait for reference event" );
+    if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n");
+    error = clWaitForEvents(1, &events[0]);
+    test_error(error, "Unable to wait for reference event");
     // Grab statuses again
-  if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n");
-    error = clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL );
-    test_error( error, "Unable to get event status" );
-  if( multiple ) {
-    if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
-        error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL );
-    test_error( error, "Unable to get event status" );
-  }
-  if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
-    error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL );
-  test_error( error, "Unable to get event status" );
+    if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n");
+    error = clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status[2]), &status[2], NULL);
+    test_error(error, "Unable to get event status");
+    if (multiple)
+    {
+        if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
+        error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(status[1]), &status[1], NULL);
+        test_error(error, "Unable to get event status");
+    }
+    if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
+    error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(status[0]), &status[0], NULL);
+    test_error(error, "Unable to get event status");
-  log_info("\t\tEvent status after waiting for reference event 0: reference event 0: %s, reference event 1: %s, test event 2: %s.\n",
-           IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] ));
+    log_info("\t\tEvent status after waiting for reference event 0: reference "
+             "event 0: %s, reference event 1: %s, test event 2: %s.\n",
+             IGetStatusString(status[0]),
+             (multiple ? IGetStatusString(status[1]) : "N/A"),
+             IGetStatusString(status[2]));
     // Sanity
-    if( status[ 0 ] != CL_COMPLETE )
+    if (status[0] != CL_COMPLETE)
-        log_error( "ERROR: Waited for first event but it's not complete (status: 0: %s)\n", IGetStatusString( status[ 0 ] ) );
-        clFinish( queue );
+        log_error("ERROR: Waited for first event but it's not complete "
+                  "(status: 0: %s)\n",
+                  IGetStatusString(status[0]));
+        clFinish(queue);
         return -1;
-    // If we're multiple, and the second event isn't complete, then our test event should still be queued
-    if( multiple && status[ 1 ] != CL_COMPLETE )
+    // If we're multiple, and the second event isn't complete, then our test
+    // event should still be queued
+    if (multiple && status[1] != CL_COMPLETE)
-    if( status[ 1 ] == CL_RUNNING && status[ 2 ] == CL_RUNNING ) {
-      log_error("ERROR: Test event and second event are both running.\n");
-      clFinish( queue );
-      return -1;
-    }
-        if( status[ 2 ] != CL_QUEUED && status[ 2 ] != CL_SUBMITTED )
+        if (status[1] == CL_RUNNING && status[2] == CL_RUNNING)
-            log_error( "ERROR: Test event did not wait for second event before starting! (status of ref: 1: %s, of test: 2: %s)\n", IGetStatusString( status[ 1 ] ), IGetStatusString( status[ 2 ] ) );
-            clFinish( queue );
+            log_error("ERROR: Test event and second event are both running.\n");
+            clFinish(queue);
+            return -1;
+        }
+        if (status[2] != CL_QUEUED && status[2] != CL_SUBMITTED)
+        {
+            log_error("ERROR: Test event did not wait for second event before "
+                      "starting! (status of ref: 1: %s, of test: 2: %s)\n",
+                      IGetStatusString(status[1]), IGetStatusString(status[2]));
+            clFinish(queue);
             return -1;
         // Now wait for second event to complete, too
-    if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n");
-        error = clWaitForEvents( 1, &events[ 1 ] );
-        test_error( error, "Unable to wait for second reference event" );
+        if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n");
+        error = clWaitForEvents(1, &events[1]);
+        test_error(error, "Unable to wait for second reference event");
         // Grab statuses again
-    if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n");
-    error = clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL );
-    test_error( error, "Unable to get event status" );
-    if( multiple ) {
-      if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
-      error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL );
-      test_error( error, "Unable to get event status" );
-    }
-    if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
-    error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL );
-    test_error( error, "Unable to get event status" );
+        if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n");
+        error = clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(status[2]), &status[2], NULL);
+        test_error(error, "Unable to get event status");
+        if (multiple)
+        {
+            if (PRINT_OPS) log_info("\tChecking status of action 1...\n");
+            error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof(status[1]), &status[1], NULL);
+            test_error(error, "Unable to get event status");
+        }
+        if (PRINT_OPS) log_info("\tChecking status of action 0...\n");
+        error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(status[0]), &status[0], NULL);
+        test_error(error, "Unable to get event status");
-    log_info("\t\tEvent status after waiting for reference event 1: reference event 0: %s, reference event 1: %s, test event 2: %s.\n",
-             IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] ));
+        log_info(
+            "\t\tEvent status after waiting for reference event 1: reference "
+            "event 0: %s, reference event 1: %s, test event 2: %s.\n",
+            IGetStatusString(status[0]),
+            (multiple ? IGetStatusString(status[1]) : "N/A"),
+            IGetStatusString(status[2]));
         // Sanity
-        if( status[ 1 ] != CL_COMPLETE )
+        if (status[1] != CL_COMPLETE)
-            log_error( "ERROR: Waited for second reference event but it didn't complete (status: 1: %s)\n", IGetStatusString( status[ 1 ] ) );
-            clFinish( queue );
+            log_error("ERROR: Waited for second reference event but it didn't "
+                      "complete (status: 1: %s)\n",
+                      IGetStatusString(status[1]));
+            clFinish(queue);
             return -1;
-    // At this point, the test event SHOULD be running, but if it completed, we consider it a pass
-    if( status[ 2 ] == CL_COMPLETE )
+    // At this point, the test event SHOULD be running, but if it completed, we
+    // consider it a pass
+    if (status[2] == CL_COMPLETE)
-        log_info( "WARNING: Test event already completed. Assumed valid.\n" );
-        clFinish( queue );
+        log_info("WARNING: Test event already completed. Assumed valid.\n");
+        clFinish(queue);
         return 0;
-    if( status[ 2 ] != CL_RUNNING && status[ 2 ] != CL_SUBMITTED && status[ 2 ] != CL_QUEUED)
+    if (status[2] != CL_RUNNING && status[2] != CL_SUBMITTED
+        && status[2] != CL_QUEUED)
-        log_error( "ERROR: Second event did not start running after reference event(s) completed! (status: 2: %s)\n", IGetStatusString( status[ 2 ] ) );
-        clFinish( queue );
+        log_error("ERROR: Second event did not start running after reference "
+                  "event(s) completed! (status: 2: %s)\n",
+                  IGetStatusString(status[2]));
+        clFinish(queue);
         return -1;
     // Wait for the test event, then return
-  if (PRINT_OPS) log_info("\tWaiting for action 2 to test to finish...\n");
-    error = clWaitForEvents( 1, &events[ 2 ] );
-    test_error( error, "Unable to wait for test event" );
+    if (PRINT_OPS) log_info("\tWaiting for action 2 to test to finish...\n");
+    error = clWaitForEvents(1, &events[2]);
+    test_error(error, "Unable to wait for test event");
-  error |= clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL );
-  test_error( error, "Unable to get event status" );
+    error |= clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                            sizeof(status[2]), &status[2], NULL);
+    test_error(error, "Unable to get event status");
-  log_info("\t\tEvent status after waiting for test event: reference event 0: %s, reference event 1: %s, test event 2: %s.\n",
-           IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] ));
+    log_info("\t\tEvent status after waiting for test event: reference event "
+             "0: %s, reference event 1: %s, test event 2: %s.\n",
+             IGetStatusString(status[0]),
+             (multiple ? IGetStatusString(status[1]) : "N/A"),
+             IGetStatusString(status[2]));
-  // Sanity
-  if( status[ 2 ] != CL_COMPLETE )
-  {
-    log_error( "ERROR: Test event didn't complete (status: 2: %s)\n", IGetStatusString( status[ 2 ] ) );
-    clFinish( queue );
-    return -1;
-  }
+    // Sanity
+    if (status[2] != CL_COMPLETE)
+    {
+        log_error("ERROR: Test event didn't complete (status: 2: %s)\n",
+                  IGetStatusString(status[2]));
+        clFinish(queue);
+        return -1;
+    }
-  clFinish(queue);
+    clFinish(queue);
     return 0;
-#define TEST_ACTION( name ) \
-    {    \
-        name##Action action;    \
-        log_info( "-- Testing " #name " (waiting on 1 event)...\n" );    \
-        if( ( error = test_waitlist( deviceID, context, queue, &action, false ) ) != CL_SUCCESS )    \
-            retVal++;            \
-        clFinish( queue ); \
-    }    \
-    if( error == CL_SUCCESS )    /* Only run multiples test if single test passed */ \
-    {    \
-        name##Action action;    \
-        log_info( "-- Testing " #name " (waiting on 2 events)...\n" );    \
-        if( ( error = test_waitlist( deviceID, context, queue, &action, true ) ) != CL_SUCCESS )    \
-            retVal++;            \
-        clFinish( queue ); \
+// Runs test_waitlist() for the action type `name##Action`: first waiting on a
+// single event and then, only if that run returned CL_SUCCESS, waiting on two
+// events. Each failing run increments `retVal`, and the queue is finished
+// after every run. The expansion site must have `error`, `retVal`, `deviceID`,
+// `context` and `queue` in scope.
+#define TEST_ACTION(name)                                                      \
+    {                                                                          \
+        name##Action action;                                                   \
+        log_info("-- Testing " #name " (waiting on 1 event)...\n");            \
+        if ((error = test_waitlist(deviceID, context, queue, &action, false))  \
+            != CL_SUCCESS)                                                     \
+            retVal++;                                                          \
+        clFinish(queue);                                                       \
+    }                                                                          \
+    if (error                                                                  \
+        == CL_SUCCESS) /* Only run multiples test if single test passed */     \
+    {                                                                          \
+        name##Action action;                                                   \
+        log_info("-- Testing " #name " (waiting on 2 events)...\n");           \
+        if ((error = test_waitlist(deviceID, context, queue, &action, true))   \
+            != CL_SUCCESS)                                                     \
+            retVal++;                                                          \
+        clFinish(queue);                                                       \
-int test_waitlists( cl_device_id deviceID, cl_context context, cl_command_queue oldQueue, int num_elements )
+// Entry point for the event wait-list tests. Creates its own out-of-order
+// command queue (returning 0 after a warning when the device does not support
+// out-of-order execution) and expands TEST_ACTION for the kernel and buffer
+// actions, then for the 2D-image and 3D-image actions when the corresponding
+// image support checks pass. Returns retVal, which TEST_ACTION increments for
+// every failing run. `oldQueue` and `num_elements` are not referenced in the
+// visible body.
+int test_waitlists(cl_device_id deviceID, cl_context context,
+                   cl_command_queue oldQueue, int num_elements)
     cl_int error;
     int retVal = 0;
     cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
-    if( !checkDeviceForQueueSupport( deviceID, props ) )
+    if (!checkDeviceForQueueSupport(deviceID, props))
-        log_info( "WARNING: Device does not support out-of-order exec mode; skipping test.\n" );
+        log_info("WARNING: Device does not support out-of-order exec mode; "
+                 "skipping test.\n");
         return 0;
-    clCommandQueueWrapper queue = clCreateCommandQueue( context, deviceID, props, &error );
+    clCommandQueueWrapper queue =
+        clCreateCommandQueue(context, deviceID, props, &error);
     test_error(error, "Unable to create out-of-order queue");
-    log_info( "\n" );
+    log_info("\n");
-    TEST_ACTION( NDRangeKernel )
+    TEST_ACTION(NDRangeKernel)
-    TEST_ACTION( ReadBuffer )
-    TEST_ACTION( WriteBuffer )
-    TEST_ACTION( MapBuffer )
-    TEST_ACTION( UnmapBuffer )
+    TEST_ACTION(ReadBuffer)
+    TEST_ACTION(WriteBuffer)
+    TEST_ACTION(MapBuffer)
+    TEST_ACTION(UnmapBuffer)
-    if( checkForImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED )
+    if (checkForImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
-        log_info( "\nNote: device does not support images. Skipping remainder of waitlist tests...\n" );
+        log_info("\nNote: device does not support images. Skipping remainder "
+                 "of waitlist tests...\n");
-        TEST_ACTION( ReadImage2D )
-        TEST_ACTION( WriteImage2D )
-        TEST_ACTION( CopyImage2Dto2D )
-        TEST_ACTION( Copy2DImageToBuffer )
-        TEST_ACTION( CopyBufferTo2DImage )
-        TEST_ACTION( MapImage )
+        TEST_ACTION(ReadImage2D)
+        TEST_ACTION(WriteImage2D)
+        TEST_ACTION(CopyImage2Dto2D)
+        TEST_ACTION(Copy2DImageToBuffer)
+        TEST_ACTION(CopyBufferTo2DImage)
+        TEST_ACTION(MapImage)
-        if( checkFor3DImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED )
-            log_info("Device does not support 3D images. Skipping remainder of waitlist tests...\n");
+        if (checkFor3DImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
+            log_info("Device does not support 3D images. Skipping remainder of "
+                     "waitlist tests...\n");
-            TEST_ACTION( ReadImage3D )
-            TEST_ACTION( WriteImage3D )
-            TEST_ACTION( CopyImage2Dto3D )
-            TEST_ACTION( CopyImage3Dto2D )
-            TEST_ACTION( CopyImage3Dto3D )
-            TEST_ACTION( Copy3DImageToBuffer )
-            TEST_ACTION( CopyBufferTo3DImage )
+            TEST_ACTION(ReadImage3D)
+            TEST_ACTION(WriteImage3D)
+            TEST_ACTION(CopyImage2Dto3D)
+            TEST_ACTION(CopyImage3Dto2D)
+            TEST_ACTION(CopyImage3Dto3D)
+            TEST_ACTION(Copy3DImageToBuffer)
+            TEST_ACTION(CopyBufferTo3DImage)
     return retVal;
diff --git a/test_conformance/extensions/CMakeLists.txt b/test_conformance/extensions/CMakeLists.txt
index 53d77ee..d95d29a 100644
--- a/test_conformance/extensions/CMakeLists.txt
+++ b/test_conformance/extensions/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory( cl_ext_cxx_for_opencl )
+add_subdirectory( cl_khr_command_buffer )
 add_subdirectory( cl_khr_dx9_media_sharing )
diff --git a/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
new file mode 100644
index 0000000..ac259f6
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
@@ -0,0 +1,8 @@
+    main.cpp
+    basic_command_buffer.cpp
diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
new file mode 100644
index 0000000..62a02d8
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
@@ -0,0 +1,588 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "command_buffer_test_base.h"
+#include "procs.h"
+#include "harness/typeWrappers.h"
+#include <algorithm>
+#include <cstring>
+#include <vector>
+// Compares an expected value with an actual one. On mismatch it logs both
+// values together with the element index and makes the enclosing function
+// return TEST_FAIL. `reference` and `result` are printed with %d; `index` may
+// be any integer type (callers pass size_t loop counters).
+#define CHECK_VERIFICATION_ERROR(reference, result, index)                     \
+    {                                                                          \
+        if ((reference) != (result))                                           \
+        {                                                                      \
+            /* index is cast so that size_t loop counters match the %zu. */    \
+            log_error("Expected %d was %d at index %zu\n", (reference),        \
+                      (result), static_cast<size_t>(index));                   \
+            return TEST_FAIL;                                                  \
+        }                                                                      \
+    }
+namespace {
+// Helper test fixture for constructing OpenCL objects used in testing
+// a variety of simple command-buffer enqueue scenarios.
+struct BasicCommandBufferTest : CommandBufferTestBase
+{
+    // Records the context and queue handles supplied by the caller. The
+    // program, kernel, buffers and command-buffer are only created by SetUp().
+    // Initialisers are listed in member declaration order.
+    BasicCommandBufferTest(cl_device_id device, cl_context context,
+                           cl_command_queue queue)
+        : CommandBufferTestBase(device), context(context), queue(queue),
+          command_buffer(this), num_elements(0), simultaneous_use(false),
+          out_of_order_support(false)
+    {}
+
+    // Returns true when the test should be skipped because `queue` lacks a
+    // property that the device requires of queues used with command-buffers.
+    // NOTE(review): test_error() appears to return the error code from the
+    // enclosing function on failure; in this bool function a failed query
+    // would therefore read as "skip" - confirm against the harness.
+    virtual bool Skip()
+    {
+        cl_command_queue_properties required_properties;
+        cl_int error = clGetDeviceInfo(
+            device, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR,
+            sizeof(required_properties), &required_properties, NULL);
+        test_error(error,
+                   "Unable to query "
+                   "CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR");
+
+        cl_command_queue_properties queue_properties;
+        error = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+                                      sizeof(queue_properties),
+                                      &queue_properties, NULL);
+        test_error(error, "Unable to query CL_QUEUE_PROPERTIES");
+
+        // Skip if queue properties don't contain those required
+        return required_properties != (required_properties & queue_properties);
+    }
+
+    // Creates the objects the derived tests rely on: the extension entry
+    // points, a `copy` kernel with its two buffer arguments bound, input and
+    // output buffers holding `elements` cl_ints each, and a command-buffer
+    // targeting `queue`.
+    // Returns CL_SUCCESS on success, the error from init_extension_functions()
+    // if that fails, or CL_INVALID_VALUE when `elements` is not positive;
+    // every other failure is reported through test_error().
+    virtual cl_int SetUp(int elements)
+    {
+        cl_int error = init_extension_functions();
+        if (error != CL_SUCCESS)
+        {
+            return error;
+        }
+
+        // Query if device supports simultaneous use
+        cl_device_command_buffer_capabilities_khr capabilities;
+        error =
+            clGetDeviceInfo(device, CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR,
+                            sizeof(capabilities), &capabilities, NULL);
+        test_error(error,
+                   "Unable to query CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR");
+        // Compare against zero so the bitfield-to-bool conversion is explicit.
+        simultaneous_use =
+            (capabilities & CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR)
+            != 0;
+        out_of_order_support =
+            (capabilities & CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR) != 0;
+
+        if (elements <= 0)
+        {
+            return CL_INVALID_VALUE;
+        }
+        num_elements = static_cast<size_t>(elements);
+
+        // Create a kernel that performs a parallel copy from an input buffer
+        // to an output buffer.
+        const char *kernel_str =
+            R"(
+        __kernel void copy(__global int* in, __global int* out) {
+            size_t id = get_global_id(0);
+            out[id] = in[id];
+        })";
+        error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                           &kernel_str);
+        test_error(error, "Failed to create program with source");
+        error = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
+        test_error(error, "Failed to build program");
+
+        in_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
+                                sizeof(cl_int) * num_elements, nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+        out_mem =
+            clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                           sizeof(cl_int) * num_elements, nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        kernel = clCreateKernel(program, "copy", &error);
+        test_error(error, "Failed to create copy kernel");
+        error = clSetKernelArg(kernel, 0, sizeof(in_mem), &in_mem);
+        test_error(error, "clSetKernelArg failed");
+        error = clSetKernelArg(kernel, 1, sizeof(out_mem), &out_mem);
+        test_error(error, "clSetKernelArg failed");
+
+        if (simultaneous_use)
+        {
+            // Zero-terminated property list requesting simultaneous use, so
+            // the command-buffer may be enqueued again while still pending.
+            cl_command_buffer_properties_khr properties[3] = {
+                CL_COMMAND_BUFFER_FLAGS_KHR,
+                CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR, 0
+            };
+            command_buffer =
+                clCreateCommandBufferKHR(1, &queue, properties, &error);
+        }
+        else
+        {
+            command_buffer =
+                clCreateCommandBufferKHR(1, &queue, nullptr, &error);
+        }
+        test_error(error, "clCreateCommandBufferKHR failed");
+        return CL_SUCCESS;
+    }
+
+    // Test body returning an OpenCL error code
+    virtual cl_int Run() = 0;
+
+    // Size in bytes of each of the input and output buffers.
+    size_t data_size() const { return num_elements * sizeof(cl_int); }
+
+    cl_context context;
+    cl_command_queue queue;
+    clCommandBufferWrapper command_buffer;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper in_mem, out_mem;
+    size_t num_elements;
+
+    // Device support query results
+    bool simultaneous_use;
+    bool out_of_order_support;
+};
+// Test enqueuing a command-buffer containing a single NDRange command once
+struct BasicEnqueueTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    // Records one launch of the copy kernel, fills the input buffer with a
+    // known pattern, enqueues the command-buffer once and checks that every
+    // element of the output buffer holds the pattern.
+    cl_int Run() override
+    {
+        cl_int error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        const cl_int pattern = 42;
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0,
+                                    data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Blocking read: the host vector is valid as soon as the call returns.
+        std::vector<cl_int> output_data(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+        }
+        return CL_SUCCESS;
+    }
+};
+// Test enqueuing a command-buffer containing multiple commands, including
+// operations other than NDRange kernel execution.
+struct MixedCommandsTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    // Records `iterations` rounds of fill -> copy kernel -> buffer copy. Each
+    // round uses a distinct pattern and copies the first output element into
+    // its own slot of a result buffer. After one enqueue, slot i of the result
+    // buffer must hold pattern_base + i.
+    cl_int Run() override
+    {
+        cl_int error;
+        const size_t iterations = 4;
+        clMemWrapper result_mem =
+            clCreateBuffer(context, CL_MEM_READ_WRITE,
+                           sizeof(cl_int) * iterations, nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        const cl_int pattern_base = 42;
+        for (size_t i = 0; i < iterations; i++)
+        {
+            const cl_int pattern = pattern_base + static_cast<cl_int>(i);
+            error = clCommandFillBufferKHR(
+                command_buffer, nullptr, in_mem, &pattern, sizeof(cl_int), 0,
+                data_size(), 0, nullptr, nullptr, nullptr);
+            test_error(error, "clCommandFillBufferKHR failed");
+            error = clCommandNDRangeKernelKHR(
+                command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+                &num_elements, nullptr, 0, nullptr, nullptr, nullptr);
+            test_error(error, "clCommandNDRangeKernelKHR failed");
+            const size_t result_offset = i * sizeof(cl_int);
+            error = clCommandCopyBufferKHR(
+                command_buffer, nullptr, out_mem, result_mem, 0, result_offset,
+                sizeof(cl_int), 0, nullptr, nullptr, nullptr);
+            test_error(error, "clCommandCopyBufferKHR failed");
+        }
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // The host vector is sized by `iterations`, matching the number of
+        // bytes read back, so the read cannot overrun it when num_elements is
+        // smaller than `iterations`.
+        std::vector<cl_int> result_data(iterations);
+        error = clEnqueueReadBuffer(queue, result_mem, CL_TRUE, 0,
+                                    iterations * sizeof(cl_int),
+                                    result_data.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < iterations; i++)
+        {
+            const cl_int ref = pattern_base + static_cast<cl_int>(i);
+            CHECK_VERIFICATION_ERROR(ref, result_data[i], i);
+        }
+        return CL_SUCCESS;
+    }
+};
+// Test enqueueing a command-buffer blocked on a user-event
+struct UserEventTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    // Enqueues the command-buffer with a user event in its wait list, queues a
+    // non-blocking read of the output, then signals the event and waits for
+    // the queue to drain before verifying the output.
+    cl_int Run() override
+    {
+        cl_int error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        clEventWrapper user_event = clCreateUserEvent(context, &error);
+        test_error(error, "clCreateUserEvent failed");
+
+        const cl_int pattern = 42;
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0,
+                                    data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
+                                          &user_event, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Non-blocking read: output_data must not be inspected until the
+        // clFinish() below has returned.
+        std::vector<cl_int> output_data(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(),
+                                    output_data.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Release the command-buffer by completing the event it waits on.
+        error = clSetUserEventStatus(user_event, CL_COMPLETE);
+        test_error(error, "clSetUserEventStatus failed");
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+        }
+        return CL_SUCCESS;
+    }
+};
+// Test flushing the command-queue between command-buffer enqueues
+struct ExplicitFlushTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    // Enqueues the same command-buffer twice with a different input pattern
+    // each time, calling clFlush() between the enqueues, and reads the output
+    // after each enqueue into a separate host vector. Both vectors are checked
+    // once the queue has finished.
+    cl_int Run() override
+    {
+        cl_int error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        // First pass: pattern A.
+        const cl_int pattern_A = 42;
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_A, sizeof(cl_int),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        std::vector<cl_int> output_data_A(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(),
+                                    output_data_A.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Second pass: pattern B, re-using the same command-buffer.
+        const cl_int pattern_B = 0xA;
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_B, sizeof(cl_int),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
+        std::vector<cl_int> output_data_B(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(),
+                                    output_data_B.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Both reads were non-blocking; wait before touching the host data.
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_A, output_data_A[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_B, output_data_B[i], i);
+        }
+        return CL_SUCCESS;
+    }
+
+    // Skipped when the device does not report simultaneous-use support, in
+    // addition to the base fixture's queue-property check.
+    bool Skip() override
+    {
+        return !simultaneous_use || BasicCommandBufferTest::Skip();
+    }
+};
+// Test enqueueing a command-buffer twice separated by another enqueue operation
+struct InterleavedEnqueueTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+    // Records a single ND-range kernel command, finalizes the command-buffer
+    // and enqueues it twice, filling in_mem with a different pattern before
+    // each enqueue. in_mem is then copied to out_mem, read back with a
+    // blocking read and compared against the second pattern.
+    // Returns CL_SUCCESS when every element matches.
+    cl_int Run() override
+    {
+        cl_int error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+        // First fill + enqueue.
+        cl_int pattern = 42;
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0,
+                                    data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // Second fill + enqueue; `pattern` now holds the value that is
+        // verified at the end.
+        pattern = 0xABCD;
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0,
+                                    data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        error = clEnqueueCopyBuffer(queue, in_mem, out_mem, 0, 0, data_size(),
+                                    0, nullptr, nullptr);
+        test_error(error, "clEnqueueCopyBuffer failed");
+        std::vector<cl_int> output_data(num_elements);
+        // Blocking read into the host vector's storage.
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+        }
+        return CL_SUCCESS;
+    }
+    // Skipped when simultaneous use is unavailable or the base fixture skips.
+    bool Skip() override
+    {
+        return !simultaneous_use || BasicCommandBufferTest::Skip();
+    }
+};
+// Test sync-points with an out-of-order command-buffer
+struct OutOfOrderTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+    // Initializers are listed in member declaration order, which is the order
+    // in which the members are actually constructed.
+    OutOfOrderTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue)
+        : BasicCommandBufferTest(device, context, queue),
+          out_of_order_queue(nullptr), out_of_order_command_buffer(this),
+          event(nullptr)
+    {}
+    // Records two buffer fills (in_mem <- pattern, out_mem <-
+    // overwritten_pattern), each producing a sync-point, followed by an
+    // ND-range kernel command that waits on both sync-points. The
+    // command-buffer is enqueued once and out_mem is read back on the
+    // out-of-order queue, waiting on the command-buffer's event.
+    // out_mem is expected to hold `pattern`, i.e. the kernel result must have
+    // replaced overwritten_pattern (the kernel itself is defined outside this
+    // struct).
+    cl_int Run() override
+    {
+        cl_sync_point_khr sync_points[2];
+        const cl_int pattern = 42;
+        cl_int error =
+            clCommandFillBufferKHR(out_of_order_command_buffer, nullptr, in_mem,
+                                   &pattern, sizeof(cl_int), 0, data_size(), 0,
+                                   nullptr, &sync_points[0], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+        const cl_int overwritten_pattern = 0xACDC;
+        error = clCommandFillBufferKHR(out_of_order_command_buffer, nullptr,
+                                       out_mem, &overwritten_pattern,
+                                       sizeof(cl_int), 0, data_size(), 0,
+                                       nullptr, &sync_points[1], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+        // The kernel command depends on both fills via their sync-points.
+        error = clCommandNDRangeKernelKHR(
+            out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &num_elements, nullptr, 2, sync_points, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(out_of_order_command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        std::vector<cl_int> output_data(num_elements);
+        // Blocking read that waits on the command-buffer's completion event.
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
+                                    data_size(), output_data.data(), 1, &event,
+                                    nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+        }
+        return CL_SUCCESS;
+    }
+    // Runs the base set-up, then creates the out-of-order queue and a
+    // command-buffer targeting it. Creation is bypassed when the device has
+    // no out-of-order support, since Skip() will then report the test as
+    // skipped.
+    cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicCommandBufferTest::SetUp failed");
+        if (!out_of_order_support)
+        {
+            // Test will skip as device doesn't support out-of-order
+            // command-buffers
+            return CL_SUCCESS;
+        }
+        out_of_order_queue = clCreateCommandQueue(
+            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
+        test_error(error, "Unable to create command queue to test with");
+        out_of_order_command_buffer =
+            clCreateCommandBufferKHR(1, &out_of_order_queue, nullptr, &error);
+        test_error(error, "clCreateCommandBufferKHR failed");
+        return CL_SUCCESS;
+    }
+    // Skipped without out-of-order support or when the base fixture skips.
+    bool Skip() override
+    {
+        return !out_of_order_support || BasicCommandBufferTest::Skip();
+    }
+    // Declaration order matters: members are destroyed in reverse order, so
+    // the event and command-buffer are released before the queue.
+    clCommandQueueWrapper out_of_order_queue;
+    clCommandBufferWrapper out_of_order_command_buffer;
+    clEventWrapper event;
+};
+// Constructs fixture T, runs its SetUp(), then either skips or runs the test.
+// SetUp() is called before Skip() is consulted.
+// Returns TEST_FAIL when SetUp() or Run() reports an error,
+// TEST_SKIPPED_ITSELF when the fixture asks to be skipped, and TEST_PASS
+// otherwise.
+template <class T>
+int MakeAndRunTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue, int num_elements)
+{
+    auto test_fixture = T(device, context, queue);
+    cl_int error = test_fixture.SetUp(num_elements);
+    test_error_ret(error, "Error in test initialization", TEST_FAIL);
+    if (test_fixture.Skip())
+    {
+        return TEST_SKIPPED_ITSELF;
+    }
+    error = test_fixture.Run();
+    test_error_ret(error, "Test Failed", TEST_FAIL);
+    return TEST_PASS;
+}
+} // anonymous namespace
+// Harness entry point: runs BasicEnqueueTest through MakeAndRunTest.
+int test_single_ndrange(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<BasicEnqueueTest>(device, context, queue,
+                                            num_elements);
+}
+// Harness entry point: runs InterleavedEnqueueTest through MakeAndRunTest.
+int test_interleaved_enqueue(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<InterleavedEnqueueTest>(device, context, queue,
+                                                  num_elements);
+}
+// Harness entry point: runs MixedCommandsTest through MakeAndRunTest.
+int test_mixed_commands(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<MixedCommandsTest>(device, context, queue,
+                                             num_elements);
+}
+// Harness entry point: runs ExplicitFlushTest through MakeAndRunTest.
+int test_explicit_flush(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<ExplicitFlushTest>(device, context, queue,
+                                             num_elements);
+}
+// Harness entry point: runs UserEventTest through MakeAndRunTest.
+int test_user_events(cl_device_id device, cl_context context,
+                     cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<UserEventTest>(device, context, queue, num_elements);
+}
+// Harness entry point: runs OutOfOrderTest through MakeAndRunTest.
+int test_out_of_order(cl_device_id device, cl_context context,
+                      cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<OutOfOrderTest>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
new file mode 100644
index 0000000..0fd2e4e
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <CL/cl_ext.h>
+#include "harness/deviceInfo.h"
+#include "harness/testHarness.h"
+// Base class for setting function pointers to new extension entry points
+struct CommandBufferTestBase
+{
+    CommandBufferTestBase(cl_device_id device): device(device) {}
+    // Resolves every cl_khr_command_buffer entry point declared below through
+    // clGetExtensionFunctionAddressForPlatform, using the platform that owns
+    // `device`.
+    // Returns CL_SUCCESS when all lookups succeed, TEST_FAIL (after logging
+    // the function name) as soon as one lookup yields nullptr; a failing
+    // platform query is reported through test_error.
+    cl_int init_extension_functions()
+    {
+        cl_platform_id platform;
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                            &platform, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
+        // If it is supported get the addresses of all the APIs here.
+        // The member pointer shares its name with the API function, so FUNC
+        // is used both as the assignment target and, stringified, as the
+        // name passed to the lookup.
+#define GET_EXTENSION_ADDRESS(FUNC)                                            \
+    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
+        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
+    if (FUNC == nullptr)                                                       \
+    {                                                                          \
+        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
+                  " with " #FUNC "\n");                                        \
+        return TEST_FAIL;                                                      \
+    }
+        GET_EXTENSION_ADDRESS(clCreateCommandBufferKHR);
+        GET_EXTENSION_ADDRESS(clReleaseCommandBufferKHR);
+        GET_EXTENSION_ADDRESS(clRetainCommandBufferKHR);
+        GET_EXTENSION_ADDRESS(clFinalizeCommandBufferKHR);
+        GET_EXTENSION_ADDRESS(clEnqueueCommandBufferKHR);
+        GET_EXTENSION_ADDRESS(clCommandBarrierWithWaitListKHR);
+        GET_EXTENSION_ADDRESS(clCommandCopyBufferKHR);
+        GET_EXTENSION_ADDRESS(clCommandCopyBufferRectKHR);
+        GET_EXTENSION_ADDRESS(clCommandCopyBufferToImageKHR);
+        GET_EXTENSION_ADDRESS(clCommandCopyImageKHR);
+        GET_EXTENSION_ADDRESS(clCommandCopyImageToBufferKHR);
+        GET_EXTENSION_ADDRESS(clCommandFillBufferKHR);
+        GET_EXTENSION_ADDRESS(clCommandFillImageKHR);
+        GET_EXTENSION_ADDRESS(clCommandNDRangeKernelKHR);
+        GET_EXTENSION_ADDRESS(clGetCommandBufferInfoKHR);
+        // The helper is only meaningful inside this function; do not leak it
+        // into every translation unit that includes this header.
+#undef GET_EXTENSION_ADDRESS
+        return CL_SUCCESS;
+    }
+    // Extension entry points; nullptr until init_extension_functions()
+    // has resolved them.
+    clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = nullptr;
+    clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = nullptr;
+    clRetainCommandBufferKHR_fn clRetainCommandBufferKHR = nullptr;
+    clFinalizeCommandBufferKHR_fn clFinalizeCommandBufferKHR = nullptr;
+    clEnqueueCommandBufferKHR_fn clEnqueueCommandBufferKHR = nullptr;
+    clCommandBarrierWithWaitListKHR_fn clCommandBarrierWithWaitListKHR =
+        nullptr;
+    clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = nullptr;
+    clCommandCopyBufferRectKHR_fn clCommandCopyBufferRectKHR = nullptr;
+    clCommandCopyBufferToImageKHR_fn clCommandCopyBufferToImageKHR = nullptr;
+    clCommandCopyImageKHR_fn clCommandCopyImageKHR = nullptr;
+    clCommandCopyImageToBufferKHR_fn clCommandCopyImageToBufferKHR = nullptr;
+    clCommandFillBufferKHR_fn clCommandFillBufferKHR = nullptr;
+    clCommandFillImageKHR_fn clCommandFillImageKHR = nullptr;
+    clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = nullptr;
+    clGetCommandBufferInfoKHR_fn clGetCommandBufferInfoKHR = nullptr;
+    // Device whose platform is queried for the entry points.
+    cl_device_id device = nullptr;
+};
+// Wrapper class based off generic typeWrappers.h wrappers. However, because
+// the release/retain functions are queried at runtime from the platform,
+// rather than known at compile time we cannot link the instantiated template.
+// Instead, pass an instance of `CommandBufferTestBase` on wrapper construction
+// to access the release/retain functions.
+class clCommandBufferWrapper {
+    // Owned handle; nullptr means the wrapper is empty.
+    cl_command_buffer_khr object = nullptr;
+    // Increments the reference count of the held object, if any.
+    // Aborts the process when the runtime reports an error.
+    void retain()
+    {
+        if (!object) return;
+        auto err = base->clRetainCommandBufferKHR(object);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clRetainCommandBufferKHR() failed");
+            std::abort();
+        }
+    }
+    // Decrements the reference count of the held object, if any.
+    // Aborts the process when the runtime reports an error.
+    void release()
+    {
+        if (!object) return;
+        auto err = base->clReleaseCommandBufferKHR(object);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clReleaseCommandBufferKHR() failed");
+            std::abort();
+        }
+    }
+    // Used to access release/retain functions
+    CommandBufferTestBase *base;
+
+public:
+    // We always want to have base available to dereference
+    clCommandBufferWrapper() = delete;
+    clCommandBufferWrapper(CommandBufferTestBase *base): base(base) {}
+    // On assignment, assume the object has a refcount of one.
+    clCommandBufferWrapper &operator=(cl_command_buffer_khr rhs)
+    {
+        reset(rhs);
+        return *this;
+    }
+    // Copy semantics, increase retain count.
+    // `base` must be taken from the source before delegating to the
+    // assignment operator, which calls retain() through it.
+    clCommandBufferWrapper(clCommandBufferWrapper const &w): base(w.base)
+    {
+        *this = w;
+    }
+    clCommandBufferWrapper &operator=(clCommandBufferWrapper const &w)
+    {
+        // Self-assignment would release the object before retaining it.
+        if (this != &w)
+        {
+            reset(w.object);
+            retain();
+        }
+        return *this;
+    }
+    // Move semantics, directly take ownership.
+    // `base` is copied so the destructor can release the adopted object.
+    clCommandBufferWrapper(clCommandBufferWrapper &&w): base(w.base)
+    {
+        *this = std::move(w);
+    }
+    clCommandBufferWrapper &operator=(clCommandBufferWrapper &&w)
+    {
+        // Self-move would release the object and then drop the handle.
+        if (this != &w)
+        {
+            reset(w.object);
+            w.object = nullptr;
+        }
+        return *this;
+    }
+    ~clCommandBufferWrapper() { reset(); }
+    // Release the existing object, if any, and own the new one, if any.
+    void reset(cl_command_buffer_khr new_object = nullptr)
+    {
+        release();
+        object = new_object;
+    }
+    // Implicit access to the raw handle for passing to the OpenCL API.
+    operator cl_command_buffer_khr() const { return object; }
+};
+// Early-out guard for test entry points: when `device` does not report the
+// "cl_khr_command_buffer" extension, logs a message and makes the enclosing
+// function return TEST_SKIPPED_ITSELF. Expands to a braced block, so it must
+// be used inside a function whose return type accepts that value.
+#define CHECK_COMMAND_BUFFER_EXTENSION_AVAILABLE(device)                       \
+    {                                                                          \
+        if (!is_extension_available(device, "cl_khr_command_buffer"))          \
+        {                                                                      \
+            log_info(                                                          \
+                "Device does not support 'cl_khr_command_buffer'. Skipping "   \
+                "the test.\n");                                                \
+            return TEST_SKIPPED_ITSELF;                                        \
+        }                                                                      \
+    }
diff --git a/test_conformance/extensions/cl_khr_command_buffer/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
new file mode 100644
index 0000000..4dece45
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "procs.h"
+#include "harness/testHarness.h"
+// Tests registered with the harness; each ADD_TEST(name) entry corresponds to
+// a test_<name> function declared in procs.h.
+test_definition test_list[] = {
+    ADD_TEST(single_ndrange), ADD_TEST(interleaved_enqueue),
+    ADD_TEST(mixed_commands), ADD_TEST(explicit_flush),
+    ADD_TEST(user_events),    ADD_TEST(out_of_order)
+};
+// Program entry point: hands the registered tests to the conformance harness
+// and returns its result. The harness is asked for a queue with no special
+// properties.
+int main(int argc, const char *argv[])
+{
+    // A device may report the required properties of a queue that
+    // is compatible with command-buffers via the query
+    // CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR. We account
+    // for this in the tests themselves, rather than here, where we have a
+    // device to query.
+    const cl_command_queue_properties queue_properties = 0;
+    return runTestHarnessWithCheck(argc, argv, ARRAY_SIZE(test_list), test_list,
+                                   false, queue_properties, nullptr);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/procs.h b/test_conformance/extensions/cl_khr_command_buffer/procs.h
new file mode 100644
index 0000000..58fd228
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <CL/cl.h>
+// Basic command-buffer tests
+extern int test_single_ndrange(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_interleaved_enqueue(cl_device_id device, cl_context context,
+                                    cl_command_queue queue, int num_elements);
+extern int test_mixed_commands(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_explicit_flush(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_user_events(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements);
+extern int test_out_of_order(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements);
diff --git a/test_conformance/gl/common.h b/test_conformance/gl/common.h
index 36221da..d8587cf 100644
--- a/test_conformance/gl/common.h
+++ b/test_conformance/gl/common.h
@@ -32,8 +32,8 @@
 // These are the typically tested formats.
-static struct format common_formats[] = {
+// clang-format off
+static const format common_formats[] = {
 #ifdef __APPLE__
   { GL_RGBA8,        GL_BGRA,             GL_UNSIGNED_INT_8_8_8_8,         kUChar },
   { GL_RGBA8,        GL_BGRA,             GL_UNSIGNED_INT_8_8_8_8_REV,     kUChar },
@@ -53,25 +53,30 @@
 #ifdef GL_VERSION_3_2
-static struct format depth_formats[] = {
+static const format depth_formats[] = {
   { GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT,                          kFloat },
   { GL_DEPTH24_STENCIL8,   GL_DEPTH_STENCIL,   GL_UNSIGNED_INT_24_8,              kUInt },
+// clang-format on
 int test_images_write_common(cl_device_id device, cl_context context,
-  cl_command_queue queue, struct format* formats, size_t nformats,
-  GLenum *targets, size_t ntargets, sizevec_t* sizes, size_t nsizes );
+                             cl_command_queue queue, const format *formats,
+                             size_t nformats, GLenum *targets, size_t ntargets,
+                             sizevec_t *sizes, size_t nsizes);
-int test_images_read_common( cl_device_id device, cl_context context,
-  cl_command_queue queue, struct format* formats, size_t nformats,
-  GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes );
+int test_images_read_common(cl_device_id device, cl_context context,
+                            cl_command_queue queue, const format *formats,
+                            size_t nformats, GLenum *targets, size_t ntargets,
+                            sizevec_t *sizes, size_t nsizes);
-int test_images_get_info_common( cl_device_id device, cl_context context,
-  cl_command_queue queue, struct format* formats, size_t nformats,
-  GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes );
+int test_images_get_info_common(cl_device_id device, cl_context context,
+                                cl_command_queue queue, const format *formats,
+                                size_t nformats, GLenum *targets,
+                                size_t ntargets, sizevec_t *sizes,
+                                size_t nsizes);
 int is_rgb_101010_supported( cl_context context, GLenum gl_target );
diff --git a/test_conformance/gl/test_buffers.cpp b/test_conformance/gl/test_buffers.cpp
index 35f01ee..c61610d 100644
--- a/test_conformance/gl/test_buffers.cpp
+++ b/test_conformance/gl/test_buffers.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,126 +17,126 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
-#if !defined (__APPLE__)
-    #include <CL/cl_gl.h>
+#if !defined(__APPLE__)
+#include <CL/cl_gl.h>
 static const char *bufferKernelPattern =
-"__kernel void sample_test( __global %s%s *source, __global %s%s *clDest, __global %s%s *glDest )\n"
-"    int  tid = get_global_id(0);\n"
-"     clDest[ tid ] = source[ tid ] + (%s%s)(1);\n"
-"     glDest[ tid ] = source[ tid ] + (%s%s)(2);\n"
+    "__kernel void sample_test( __global %s%s *source, __global %s%s *clDest, "
+    "__global %s%s *glDest )\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     clDest[ tid ] = source[ tid ] + (%s%s)(1);\n"
+    "     glDest[ tid ] = source[ tid ] + (%s%s)(2);\n"
+    "}\n";
-#define TYPE_CASE( enum, type, range, offset )    \
-    case enum:    \
-    {                \
-        cl_##type *ptr = (cl_##type *)outData; \
-        for( i = 0; i < count; i++ ) \
-            ptr[ i ] = (cl_##type)( ( genrand_int32(d) & range ) - offset ); \
-        break; \
+#define TYPE_CASE(enum, type, range, offset)                                   \
+    case enum: {                                                               \
+        cl_##type *ptr = (cl_##type *)outData;                                 \
+        for (i = 0; i < count; i++)                                            \
+            ptr[i] = (cl_##type)((genrand_int32(d) & range) - offset);         \
+        break;                                                                 \
-void gen_input_data( ExplicitType type, size_t count, MTdata d, void *outData )
+void gen_input_data(ExplicitType type, size_t count, MTdata d, void *outData)
     size_t i;
-    switch( type )
+    switch (type)
-        case kBool:
-        {
+        case kBool: {
             bool *boolPtr = (bool *)outData;
-            for( i = 0; i < count; i++ )
+            for (i = 0; i < count; i++)
-                boolPtr[i] = ( genrand_int32(d) & 1 ) ? true : false;
+                boolPtr[i] = (genrand_int32(d) & 1) ? true : false;
-        TYPE_CASE( kChar, char, 250, 127 )
-        TYPE_CASE( kUChar, uchar, 250, 0 )
-        TYPE_CASE( kShort, short, 65530, 32767 )
-        TYPE_CASE( kUShort, ushort, 65530, 0 )
-        TYPE_CASE( kInt, int, 0x0fffffff, 0x70000000 )
-        TYPE_CASE( kUInt, uint, 0x0fffffff, 0 )
+            TYPE_CASE(kChar, char, 250, 127)
+            TYPE_CASE(kUChar, uchar, 250, 0)
+            TYPE_CASE(kShort, short, 65530, 32767)
+            TYPE_CASE(kUShort, ushort, 65530, 0)
+            TYPE_CASE(kInt, int, 0x0fffffff, 0x70000000)
+            TYPE_CASE(kUInt, uint, 0x0fffffff, 0)
-        case kLong:
-        {
+        case kLong: {
             cl_long *longPtr = (cl_long *)outData;
-            for( i = 0; i < count; i++ )
+            for (i = 0; i < count; i++)
-                longPtr[i] = (cl_long)genrand_int32(d) | ( (cl_ulong)genrand_int32(d) << 32 );
+                longPtr[i] = (cl_long)genrand_int32(d)
+                    | ((cl_ulong)genrand_int32(d) << 32);
-        case kULong:
-        {
+        case kULong: {
             cl_ulong *ulongPtr = (cl_ulong *)outData;
-            for( i = 0; i < count; i++ )
+            for (i = 0; i < count; i++)
-                ulongPtr[i] = (cl_ulong)genrand_int32(d) | ( (cl_ulong)genrand_int32(d) << 32 );
+                ulongPtr[i] = (cl_ulong)genrand_int32(d)
+                    | ((cl_ulong)genrand_int32(d) << 32);
-        case kFloat:
-        {
+        case kFloat: {
             cl_float *floatPtr = (float *)outData;
-            for( i = 0; i < count; i++ )
-                floatPtr[i] = get_random_float( -100000.f, 100000.f, d );
+            for (i = 0; i < count; i++)
+                floatPtr[i] = get_random_float(-100000.f, 100000.f, d);
-            log_error( "ERROR: Invalid type passed in to generate_random_data!\n" );
+            log_error(
+                "ERROR: Invalid type passed in to generate_random_data!\n");
-#define INC_CASE( enum, type )    \
-    case enum:    \
-    {                \
-        cl_##type *src = (cl_##type *)inData; \
-        cl_##type *dst = (cl_##type *)outData; \
-        *dst = *src + 1; \
-        break; \
+#define INC_CASE(enum, type)                                                   \
+    case enum: {                                                               \
+        cl_##type *src = (cl_##type *)inData;                                  \
+        cl_##type *dst = (cl_##type *)outData;                                 \
+        *dst = *src + 1;                                                       \
+        break;                                                                 \
-void get_incremented_value( void *inData, void *outData, ExplicitType type )
+void get_incremented_value(void *inData, void *outData, ExplicitType type)
-    switch( type )
+    switch (type)
-        INC_CASE( kChar, char )
-        INC_CASE( kUChar, uchar )
-        INC_CASE( kShort, short )
-        INC_CASE( kUShort, ushort )
-        INC_CASE( kInt, int )
-        INC_CASE( kUInt, uint )
-        INC_CASE( kLong, long )
-        INC_CASE( kULong, ulong )
-        INC_CASE( kFloat, float )
-        default:
-            break;
+        INC_CASE(kChar, char)
+        INC_CASE(kUChar, uchar)
+        INC_CASE(kShort, short)
+        INC_CASE(kUShort, ushort)
+        INC_CASE(kInt, int)
+        INC_CASE(kUInt, uint)
+        INC_CASE(kLong, long)
+        INC_CASE(kULong, ulong)
+        INC_CASE(kFloat, float)
+        default: break;
-int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType vecType, size_t vecSize, int numElements, int validate_only, MTdata d)
+int test_buffer_kernel(cl_context context, cl_command_queue queue,
+                       ExplicitType vecType, size_t vecSize, int numElements,
+                       int validate_only, MTdata d)
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper streams[ 3 ];
+    clMemWrapper streams[3];
     size_t dataSize = numElements * 16 * sizeof(cl_long);
 #if !(defined(_WIN32) && defined(_MSC_VER))
-    cl_long inData[numElements * 16], outDataCL[numElements * 16], outDataGL[ numElements * 16 ];
+    cl_long inData[numElements * 16], outDataCL[numElements * 16],
+        outDataGL[numElements * 16];
-    cl_long* inData    = (cl_long*)_malloca(dataSize);
-    cl_long* outDataCL = (cl_long*)_malloca(dataSize);
-    cl_long* outDataGL = (cl_long*)_malloca(dataSize);
+    cl_long *inData = (cl_long *)_malloca(dataSize);
+    cl_long *outDataCL = (cl_long *)_malloca(dataSize);
+    cl_long *outDataGL = (cl_long *)_malloca(dataSize);
     glBufferWrapper inGLBuffer, outGLBuffer;
-    int    i;
+    int i;
     size_t bufferSize;
     int error;
@@ -146,210 +146,259 @@
     char sizeName[4];
     /* Create the source */
-    if( vecSize == 1 )
-        sizeName[ 0 ] = 0;
+    if (vecSize == 1)
+        sizeName[0] = 0;
-        sprintf( sizeName, "%d", (int)vecSize );
+        sprintf(sizeName, "%d", (int)vecSize);
-    sprintf( kernelSource, bufferKernelPattern, get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName );
+    sprintf(kernelSource, bufferKernelPattern, get_explicit_type_name(vecType),
+            sizeName, get_explicit_type_name(vecType), sizeName,
+            get_explicit_type_name(vecType), sizeName,
+            get_explicit_type_name(vecType), sizeName,
+            get_explicit_type_name(vecType), sizeName);
     /* Create kernels */
     programPtr = kernelSource;
-    if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programPtr, "sample_test"))
         return -1;
-    bufferSize = numElements * vecSize * get_explicit_type_size( vecType );
+    bufferSize = numElements * vecSize * get_explicit_type_size(vecType);
     /* Generate some almost-random input data */
-    gen_input_data( vecType, vecSize * numElements, d, inData );
-    memset( outDataCL, 0, dataSize );
-    memset( outDataGL, 0, dataSize );
+    gen_input_data(vecType, vecSize * numElements, d, inData);
+    memset(outDataCL, 0, dataSize);
+    memset(outDataGL, 0, dataSize);
     /* Generate some GL buffers to go against */
-    glGenBuffers( 1, &inGLBuffer );
-    glGenBuffers( 1, &outGLBuffer );
+    glGenBuffers(1, &inGLBuffer);
+    glGenBuffers(1, &outGLBuffer);
-    glBindBuffer( GL_ARRAY_BUFFER, inGLBuffer );
-    glBufferData( GL_ARRAY_BUFFER, bufferSize, inData, GL_STATIC_DRAW );
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, bufferSize, inData, GL_STATIC_DRAW);
-    // Note: we need to bind the output buffer, even though we don't care about its values yet,
-    // because CL needs it to get the buffer size
-    glBindBuffer( GL_ARRAY_BUFFER, outGLBuffer );
-    glBufferData( GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW );
+    // Note: we need to bind the output buffer, even though we don't care about
+    // its values yet, because CL needs it to get the buffer size
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW);
-    glBindBuffer( GL_ARRAY_BUFFER, 0 );
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    /* Generate some streams. The first and last ones are GL, middle one just vanilla CL */
-    streams[ 0 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_ONLY, inGLBuffer, &error );
-    test_error( error, "Unable to create input GL buffer" );
+    /* Generate some streams. The first and last ones are GL, middle one just
+     * vanilla CL */
+    streams[0] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_ONLY,
+                                             inGLBuffer, &error);
+    test_error(error, "Unable to create input GL buffer");
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSize, NULL, &error );
-    test_error( error, "Unable to create output CL buffer" );
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE, bufferSize, NULL, &error);
+    test_error(error, "Unable to create output CL buffer");
-    streams[ 2 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_WRITE_ONLY, outGLBuffer, &error );
-    test_error( error, "Unable to create output GL buffer" );
+    streams[2] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_WRITE_ONLY,
+                                             outGLBuffer, &error);
+    test_error(error, "Unable to create output GL buffer");
-  /* Validate the info */
-  if (validate_only) {
-    int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, (GLuint)inGLBuffer, (GLenum)0, 0) |
-                  CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, (GLuint)outGLBuffer, (GLenum)0, 0) );
-    for(i=0;i<3;i++)
+    /* Validate the info */
+    if (validate_only)
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER,
+                                        (GLuint)inGLBuffer, (GLenum)0, 0)
+                      | CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER,
+                                          (GLuint)outGLBuffer, (GLenum)0, 0));
+        for (i = 0; i < 3; i++)
+        {
+            streams[i].reset();
+        }
+        glDeleteBuffers(1, &inGLBuffer);
+        inGLBuffer = 0;
+        glDeleteBuffers(1, &outGLBuffer);
+        outGLBuffer = 0;
+        return result;
-    glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
-    glDeleteBuffers(1, &outGLBuffer);    outGLBuffer = 0;
-    return result;
-  }
     /* Assign streams and execute */
-    for( int i = 0; i < 3; i++ )
+    for (int i = 0; i < 3; i++)
-        error = clSetKernelArg( kernel, i, sizeof( streams[ i ] ), &streams[ i ] );
-        test_error( error, "Unable to set kernel arguments" );
+        error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+        test_error(error, "Unable to set kernel arguments");
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &streams[ 0 ], 0, NULL, NULL);
-  test_error( error, "Unable to acquire GL obejcts");
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &streams[ 2 ], 0, NULL, NULL);
-  test_error( error, "Unable to acquire GL obejcts");
+    error =
+        (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &streams[0], 0, NULL, NULL);
+    test_error(error, "Unable to acquire GL obejcts");
+    error =
+        (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &streams[2], 0, NULL, NULL);
+    test_error(error, "Unable to acquire GL obejcts");
     /* Run the kernel */
     threads[0] = numElements;
-    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
-    test_error( error, "Unable to get work group size to use" );
+    error = get_max_common_work_group_size(context, kernel, threads[0],
+                                           &localThreads[0]);
+    test_error(error, "Unable to get work group size to use");
-  error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
-    test_error( error, "Unable to execute test kernel" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to execute test kernel");
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &streams[ 0 ], 0, NULL, NULL );
-  test_error(error, "clEnqueueReleaseGLObjects failed");
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &streams[ 2 ], 0, NULL, NULL );
-  test_error(error, "clEnqueueReleaseGLObjects failed");
+    error =
+        (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &streams[0], 0, NULL, NULL);
+    test_error(error, "clEnqueueReleaseGLObjects failed");
+    error =
+        (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &streams[2], 0, NULL, NULL);
+    test_error(error, "clEnqueueReleaseGLObjects failed");
-    // Get the results from both CL and GL and make sure everything looks correct
-    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, bufferSize, outDataCL, 0, NULL, NULL );
-    test_error( error, "Unable to read output CL array!" );
+    // Get the results from both CL and GL and make sure everything looks
+    // correct
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, bufferSize,
+                                outDataCL, 0, NULL, NULL);
+    test_error(error, "Unable to read output CL array!");
-    glBindBuffer( GL_ARRAY_BUFFER, outGLBuffer );
-    void *glMem = glMapBuffer( GL_ARRAY_BUFFER, GL_READ_ONLY );
-    memcpy( outDataGL, glMem, bufferSize );
-    glUnmapBuffer( GL_ARRAY_BUFFER );
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    memcpy(outDataGL, glMem, bufferSize);
+    glUnmapBuffer(GL_ARRAY_BUFFER);
-    char *inP = (char *)inData, *glP = (char *)outDataGL, *clP = (char *)outDataCL;
+    char *inP = (char *)inData, *glP = (char *)outDataGL,
+         *clP = (char *)outDataCL;
     error = 0;
-    for( size_t i = 0; i < numElements * vecSize; i++ )
+    for (size_t i = 0; i < numElements * vecSize; i++)
         cl_long expectedCLValue, expectedGLValue;
-        get_incremented_value( inP, &expectedCLValue, vecType );
-        get_incremented_value( &expectedCLValue, &expectedGLValue, vecType );
+        get_incremented_value(inP, &expectedCLValue, vecType);
+        get_incremented_value(&expectedCLValue, &expectedGLValue, vecType);
-        if( memcmp( clP, &expectedCLValue, get_explicit_type_size( vecType ) ) != 0 )
+        if (memcmp(clP, &expectedCLValue, get_explicit_type_size(vecType)) != 0)
-            char scratch[ 64 ];
-            log_error( "ERROR: Data sample %d from the CL output did not validate!\n", (int)i );
-            log_error( "\t   Input: %s\n", GetDataVectorString( inP, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\tExpected: %s\n", GetDataVectorString( &expectedCLValue, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\t  Actual: %s\n", GetDataVectorString( clP, get_explicit_type_size( vecType ), 1, scratch ) );
+            char scratch[64];
+            log_error(
+                "ERROR: Data sample %d from the CL output did not validate!\n",
+                (int)i);
+            log_error("\t   Input: %s\n",
+                      GetDataVectorString(inP, get_explicit_type_size(vecType),
+                                          1, scratch));
+            log_error("\tExpected: %s\n",
+                      GetDataVectorString(&expectedCLValue,
+                                          get_explicit_type_size(vecType), 1,
+                                          scratch));
+            log_error("\t  Actual: %s\n",
+                      GetDataVectorString(clP, get_explicit_type_size(vecType),
+                                          1, scratch));
             error = -1;
-        if( memcmp( glP, &expectedGLValue, get_explicit_type_size( vecType ) ) != 0 )
+        if (memcmp(glP, &expectedGLValue, get_explicit_type_size(vecType)) != 0)
-            char scratch[ 64 ];
-            log_error( "ERROR: Data sample %d from the GL output did not validate!\n", (int)i );
-            log_error( "\t   Input: %s\n", GetDataVectorString( inP, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\tExpected: %s\n", GetDataVectorString( &expectedGLValue, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\t  Actual: %s\n", GetDataVectorString( glP, get_explicit_type_size( vecType ), 1, scratch ) );
+            char scratch[64];
+            log_error(
+                "ERROR: Data sample %d from the GL output did not validate!\n",
+                (int)i);
+            log_error("\t   Input: %s\n",
+                      GetDataVectorString(inP, get_explicit_type_size(vecType),
+                                          1, scratch));
+            log_error("\tExpected: %s\n",
+                      GetDataVectorString(&expectedGLValue,
+                                          get_explicit_type_size(vecType), 1,
+                                          scratch));
+            log_error("\t  Actual: %s\n",
+                      GetDataVectorString(glP, get_explicit_type_size(vecType),
+                                          1, scratch));
             error = -1;
-        if( error )
-            return error;
+        if (error) return error;
-        inP += get_explicit_type_size( vecType );
-        glP += get_explicit_type_size( vecType );
-        clP += get_explicit_type_size( vecType );
+        inP += get_explicit_type_size(vecType);
+        glP += get_explicit_type_size(vecType);
+        clP += get_explicit_type_size(vecType);
-    for(i=0;i<3;i++)
+    for (i = 0; i < 3; i++)
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        streams[i].reset();
-    glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
-    glDeleteBuffers(1, &outGLBuffer);    outGLBuffer = 0;
+    glDeleteBuffers(1, &inGLBuffer);
+    inGLBuffer = 0;
+    glDeleteBuffers(1, &outGLBuffer);
+    outGLBuffer = 0;
     return 0;
-int test_buffers( cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
+int test_buffers(cl_device_id device, cl_context context,
+                 cl_command_queue queue, int numElements)
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kNumExplicitTypes };
+    ExplicitType vecType[] = {
+        kChar, kUChar, kShort, kUShort, kInt,
+        kUInt, kLong,  kULong, kFloat,  kNumExplicitTypes
+    };
     unsigned int vecSizes[] = { 1, 2, 4, 8, 16, 0 };
     unsigned int index, typeIndex;
     int retVal = 0;
     RandomSeed seed(gRandomSeed);
-    for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
-        for( index = 0; vecSizes[ index ] != 0; index++ )
+        for (index = 0; vecSizes[index] != 0; index++)
             // Test!
-            if( test_buffer_kernel( context, queue, vecType[ typeIndex ], vecSizes[ index ], numElements, 0, seed) != 0 )
+            if (test_buffer_kernel(context, queue, vecType[typeIndex],
+                                   vecSizes[index], numElements, 0, seed)
+                != 0)
-                char sizeNames[][ 4 ] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
-                log_error( "   Buffer test %s%s FAILED\n", get_explicit_type_name( vecType[ typeIndex ] ), sizeNames[ vecSizes[ index ] ] );
+                char sizeNames[][4] = { "", "", "2", "", "4", "", "", "",  "8",
+                                        "", "", "",  "", "",  "", "", "16" };
+                log_error("   Buffer test %s%s FAILED\n",
+                          get_explicit_type_name(vecType[typeIndex]),
+                          sizeNames[vecSizes[index]]);
     return retVal;
-int test_buffers_getinfo( cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
+int test_buffers_getinfo(cl_device_id device, cl_context context,
+                         cl_command_queue queue, int numElements)
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kNumExplicitTypes };
+    ExplicitType vecType[] = {
+        kChar, kUChar, kShort, kUShort, kInt,
+        kUInt, kLong,  kULong, kFloat,  kNumExplicitTypes
+    };
     unsigned int vecSizes[] = { 1, 2, 4, 8, 16, 0 };
     unsigned int index, typeIndex;
     int retVal = 0;
-    RandomSeed seed( gRandomSeed );
+    RandomSeed seed(gRandomSeed);
-    for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
-        for( index = 0; vecSizes[ index ] != 0; index++ )
+        for (index = 0; vecSizes[index] != 0; index++)
             // Test!
-            if( test_buffer_kernel( context, queue, vecType[ typeIndex ], vecSizes[ index ], numElements, 1, seed ) != 0 )
+            if (test_buffer_kernel(context, queue, vecType[typeIndex],
+                                   vecSizes[index], numElements, 1, seed)
+                != 0)
-                char sizeNames[][ 4 ] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
-                log_error( "   Buffer test %s%s FAILED\n", get_explicit_type_name( vecType[ typeIndex ] ), sizeNames[ vecSizes[ index ] ] );
+                char sizeNames[][4] = { "", "", "2", "", "4", "", "", "",  "8",
+                                        "", "", "",  "", "",  "", "", "16" };
+                log_error("   Buffer test %s%s FAILED\n",
+                          get_explicit_type_name(vecType[typeIndex]),
+                          sizeNames[vecSizes[index]]);
     return retVal;
diff --git a/test_conformance/gl/test_fence_sync.cpp b/test_conformance/gl/test_fence_sync.cpp
index 00bf2cc..35cc62d 100644
--- a/test_conformance/gl/test_fence_sync.cpp
+++ b/test_conformance/gl/test_fence_sync.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,7 +17,7 @@
 #include "gl/setup.h"
 #include "harness/genericThread.h"
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
 #include <OpenGL/glu.h>
 #include <GL/glu.h>
@@ -40,112 +40,121 @@
 #define APIENTRY
-typedef GLsync (APIENTRY *glFenceSyncPtr)(GLenum condition,GLbitfield flags);
+typedef GLsync(APIENTRY *glFenceSyncPtr)(GLenum condition, GLbitfield flags);
 glFenceSyncPtr glFenceSyncFunc;
-typedef bool (APIENTRY *glIsSyncPtr)(GLsync sync);
+typedef bool(APIENTRY *glIsSyncPtr)(GLsync sync);
 glIsSyncPtr glIsSyncFunc;
-typedef void (APIENTRY *glDeleteSyncPtr)(GLsync sync);
+typedef void(APIENTRY *glDeleteSyncPtr)(GLsync sync);
 glDeleteSyncPtr glDeleteSyncFunc;
-typedef GLenum (APIENTRY *glClientWaitSyncPtr)(GLsync sync,GLbitfield flags,GLuint64 timeout);
+typedef GLenum(APIENTRY *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                              GLuint64 timeout);
 glClientWaitSyncPtr glClientWaitSyncFunc;
-typedef void (APIENTRY *glWaitSyncPtr)(GLsync sync,GLbitfield flags,GLuint64 timeout);
+typedef void(APIENTRY *glWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                      GLuint64 timeout);
 glWaitSyncPtr glWaitSyncFunc;
-typedef void (APIENTRY *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
+typedef void(APIENTRY *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
 glGetInteger64vPtr glGetInteger64vFunc;
-typedef void (APIENTRY *glGetSyncivPtr)(GLsync sync,GLenum pname,GLsizei bufSize,GLsizei *length,
-                               GLint *values);
+typedef void(APIENTRY *glGetSyncivPtr)(GLsync sync, GLenum pname,
+                                       GLsizei bufSize, GLsizei *length,
+                                       GLint *values);
 glGetSyncivPtr glGetSyncivFunc;
 #define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError()))
-static void InitSyncFns( void )
+static void InitSyncFns(void)
-    glFenceSyncFunc = (glFenceSyncPtr)glutGetProcAddress( "glFenceSync" );
-    glIsSyncFunc = (glIsSyncPtr)glutGetProcAddress( "glIsSync" );
-    glDeleteSyncFunc = (glDeleteSyncPtr)glutGetProcAddress( "glDeleteSync" );
-    glClientWaitSyncFunc = (glClientWaitSyncPtr)glutGetProcAddress( "glClientWaitSync" );
-    glWaitSyncFunc = (glWaitSyncPtr)glutGetProcAddress( "glWaitSync" );
-    glGetInteger64vFunc = (glGetInteger64vPtr)glutGetProcAddress( "glGetInteger64v" );
-    glGetSyncivFunc = (glGetSyncivPtr)glutGetProcAddress( "glGetSynciv" );
+    glFenceSyncFunc = (glFenceSyncPtr)glutGetProcAddress("glFenceSync");
+    glIsSyncFunc = (glIsSyncPtr)glutGetProcAddress("glIsSync");
+    glDeleteSyncFunc = (glDeleteSyncPtr)glutGetProcAddress("glDeleteSync");
+    glClientWaitSyncFunc =
+        (glClientWaitSyncPtr)glutGetProcAddress("glClientWaitSync");
+    glWaitSyncFunc = (glWaitSyncPtr)glutGetProcAddress("glWaitSync");
+    glGetInteger64vFunc =
+        (glGetInteger64vPtr)glutGetProcAddress("glGetInteger64v");
+    glGetSyncivFunc = (glGetSyncivPtr)glutGetProcAddress("glGetSynciv");
 #ifndef GL_ARB_sync
-#define GL_MAX_SERVER_WAIT_TIMEOUT        0x9111
-#define GL_OBJECT_TYPE            0x9112
-#define GL_SYNC_CONDITION            0x9113
-#define GL_SYNC_STATUS            0x9114
-#define GL_SYNC_FLAGS            0x9115
+#define GL_OBJECT_TYPE 0x9112
+#define GL_SYNC_CONDITION 0x9113
+#define GL_SYNC_STATUS 0x9114
+#define GL_SYNC_FLAGS 0x9115
-#define GL_SYNC_FENCE            0x9116
+#define GL_SYNC_FENCE 0x9116
-#define GL_UNSIGNALED            0x9118
-#define GL_SIGNALED            0x9119
+#define GL_UNSIGNALED 0x9118
+#define GL_SIGNALED 0x9119
-#define GL_SYNC_FLUSH_COMMANDS_BIT        0x00000001
+#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001
-#define GL_ALREADY_SIGNALED        0x911A
-#define GL_TIMEOUT_EXPIRED            0x911B
-#define GL_CONDITION_SATISFIED        0x911C
-#define GL_WAIT_FAILED            0x911D
+#define GL_TIMEOUT_EXPIRED 0x911B
+#define GL_WAIT_FAILED 0x911D
 #define USING_ARB_sync 1
-typedef cl_event (CL_API_CALL *clCreateEventFromGLsyncKHR_fn)( cl_context context, GLsync sync, cl_int *errCode_ret) ;
+typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)(
+    cl_context context, GLsync sync, cl_int *errCode_ret);
 clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr;
 static const char *updateBuffersKernel[] = {
-    "__kernel void update( __global float4 * vertices, __global float4 *colors, int horizWrap, int rowIdx )\n"
+    "__kernel void update( __global float4 * vertices, __global float4 "
+    "*colors, int horizWrap, int rowIdx )\n"
     "    size_t tid = get_global_id(0);\n"
     "    size_t xVal = ( tid & ( horizWrap - 1 ) );\n"
     "    vertices[ tid * 2 + 0 ] = (float4)( xVal, rowIdx*16.f, 0.0f, 1.f );\n"
-    "    vertices[ tid * 2 + 1 ] = (float4)( xVal, rowIdx*16.f + 4.0f, 0.0f, 1.f );\n"
+    "    vertices[ tid * 2 + 1 ] = (float4)( xVal, rowIdx*16.f + 4.0f, 0.0f, "
+    "1.f );\n"
     "    int rowV = rowIdx + 1;\n"
-    "    colors[ tid * 2 + 0 ] = (float4)( ( rowV & 1 ) / 255.f, ( ( rowV & 2 ) >> 1 ) / 255.f, ( ( rowV & 4 ) >> 2 ) / 255.f, 1.f );\n"
-    "    //colors[ tid * 2 + 0 ] = (float4)( (float)xVal/(float)horizWrap, 1.0f, 1.0f, 1.0f );\n"
+    "    colors[ tid * 2 + 0 ] = (float4)( ( rowV & 1 ) / 255.f, ( ( rowV & 2 "
+    ") >> 1 ) / 255.f, ( ( rowV & 4 ) >> 2 ) / 255.f, 1.f );\n"
+    "    //colors[ tid * 2 + 0 ] = (float4)( (float)xVal/(float)horizWrap, "
+    "1.0f, 1.0f, 1.0f );\n"
     "    colors[ tid * 2 + 1 ] = colors[ tid * 2 + 0 ];\n"
-    "}\n" };
+    "}\n"
-//Passthrough VertexShader
-static const char *vertexshader =
-"#version 150\n"
-"uniform mat4 projMatrix;\n"
-"in vec4 inPosition;\n"
-"in vec4 inColor;\n"
-"out vec4 vertColor;\n"
-"void main (void) {\n"
-"    gl_Position = projMatrix*inPosition;\n"
-"   vertColor = inColor;\n"
+// Passthrough VertexShader
+static const char *vertexshader = "#version 150\n"
+                                  "uniform mat4 projMatrix;\n"
+                                  "in vec4 inPosition;\n"
+                                  "in vec4 inColor;\n"
+                                  "out vec4 vertColor;\n"
+                                  "void main (void) {\n"
+                                  "    gl_Position = projMatrix*inPosition;\n"
+                                  "   vertColor = inColor;\n"
+                                  "}\n";
-//Passthrough FragmentShader
-static const char *fragmentshader =
-"#version 150\n"
-"in vec4 vertColor;\n"
-"out vec4 outColor;\n"
-"void main (void) {\n"
-"    outColor = vertColor;\n"
+// Passthrough FragmentShader
+static const char *fragmentshader = "#version 150\n"
+                                    "in vec4 vertColor;\n"
+                                    "out vec4 outColor;\n"
+                                    "void main (void) {\n"
+                                    "    outColor = vertColor;\n"
+                                    "}\n";
 GLuint createShaderProgram(GLint *posLoc, GLint *colLoc)
-    GLint  logLength, status;
+    GLint logLength, status;
     GLuint program = glCreateProgram();
     GLuint vpShader;
@@ -153,8 +162,9 @@
     glShaderSource(vpShader, 1, (const GLchar **)&vertexshader, NULL);
     glGetShaderiv(vpShader, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*) malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetShaderInfoLog(vpShader, logLength, &logLength, log);
         log_info("Vtx Shader compile log:\n%s", log);
@@ -175,8 +185,9 @@
     glGetShaderiv(fpShader, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*)malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetShaderInfoLog(fpShader, logLength, &logLength, log);
         log_info("Frag Shader compile log:\n%s", log);
@@ -192,8 +203,9 @@
     glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*)malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetProgramInfoLog(program, logLength, &logLength, log);
         log_info("Program link log:\n%s", log);
@@ -219,7 +231,7 @@
     glGetAttachedShaders(program, 2, &count, shaders);
     int i;
-    for(i = 0; i < count; i++)
+    for (i = 0; i < count; i++)
         glDetachShader(program, shaders[i]);
@@ -227,44 +239,49 @@
-// This function queues up and runs the above CL kernel that writes the vertex data
-cl_int run_cl_kernel( cl_kernel kernel, cl_command_queue queue, cl_mem stream0, cl_mem stream1,
-                     cl_int rowIdx, cl_event fenceEvent, size_t numThreads )
+// This function queues up and runs the above CL kernel that writes the vertex
+// data
+cl_int run_cl_kernel(cl_kernel kernel, cl_command_queue queue, cl_mem stream0,
+                     cl_mem stream1, cl_int rowIdx, cl_event fenceEvent,
+                     size_t numThreads)
-    cl_int error = clSetKernelArg( kernel, 3, sizeof( rowIdx ), &rowIdx );
-    test_error( error, "Unable to set kernel arguments" );
+    cl_int error = clSetKernelArg(kernel, 3, sizeof(rowIdx), &rowIdx);
+    test_error(error, "Unable to set kernel arguments");
     clEventWrapper acqEvent1, acqEvent2, kernEvent, relEvent1, relEvent2;
-    int numEvents = ( fenceEvent != NULL ) ? 1 : 0;
-    cl_event *fence_evt = ( fenceEvent != NULL ) ? &fenceEvent : NULL;
+    int numEvents = (fenceEvent != NULL) ? 1 : 0;
+    cl_event *fence_evt = (fenceEvent != NULL) ? &fenceEvent : NULL;
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &stream0, numEvents, fence_evt, &acqEvent1 );
-    test_error( error, "Unable to acquire GL obejcts");
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &stream1, numEvents, fence_evt, &acqEvent2 );
-    test_error( error, "Unable to acquire GL obejcts");
+    error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &stream0, numEvents,
+                                             fence_evt, &acqEvent1);
+    test_error(error, "Unable to acquire GL obejcts");
+    error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &stream1, numEvents,
+                                             fence_evt, &acqEvent2);
+    test_error(error, "Unable to acquire GL obejcts");
-    cl_event evts[ 2 ] = { acqEvent1, acqEvent2 };
+    cl_event evts[2] = { acqEvent1, acqEvent2 };
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numThreads, NULL, 2, evts, &kernEvent );
-    test_error( error, "Unable to execute test kernel" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &numThreads, NULL, 2,
+                                   evts, &kernEvent);
+    test_error(error, "Unable to execute test kernel");
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &stream0, 1, &kernEvent, &relEvent1 );
+    error = (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &stream0, 1, &kernEvent,
+                                             &relEvent1);
     test_error(error, "clEnqueueReleaseGLObjects failed");
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &stream1, 1, &kernEvent, &relEvent2 );
+    error = (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &stream1, 1, &kernEvent,
+                                             &relEvent2);
     test_error(error, "clEnqueueReleaseGLObjects failed");
-    evts[ 0 ] = relEvent1;
-    evts[ 1 ] = relEvent2;
-    error = clWaitForEvents( 2, evts );
-    test_error( error, "Unable to wait for release events" );
+    evts[0] = relEvent1;
+    evts[1] = relEvent2;
+    error = clWaitForEvents(2, evts);
+    test_error(error, "Unable to wait for release events");
     return 0;
-class RunThread : public genericThread
+class RunThread : public genericThread {
     cl_kernel mKernel;
     cl_command_queue mQueue;
     cl_mem mStream0, mStream1;
@@ -272,34 +289,40 @@
     cl_event mFenceEvent;
     size_t mNumThreads;
-    RunThread( cl_kernel kernel, cl_command_queue queue, cl_mem stream0, cl_mem stream1, size_t numThreads )
-    : mKernel( kernel ), mQueue( queue ), mStream0( stream0 ), mStream1( stream1 ), mNumThreads( numThreads )
-    {
-    }
+    RunThread(cl_kernel kernel, cl_command_queue queue, cl_mem stream0,
+              cl_mem stream1, size_t numThreads)
+        : mKernel(kernel), mQueue(queue), mStream0(stream0), mStream1(stream1),
+          mNumThreads(numThreads)
+    {}
-    void SetRunData( cl_int rowIdx, cl_event fenceEvent )
+    void SetRunData(cl_int rowIdx, cl_event fenceEvent)
         mRowIdx = rowIdx;
         mFenceEvent = fenceEvent;
-    virtual void * IRun( void )
+    virtual void *IRun(void)
-        cl_int error = run_cl_kernel( mKernel, mQueue, mStream0, mStream1, mRowIdx, mFenceEvent, mNumThreads );
+        cl_int error = run_cl_kernel(mKernel, mQueue, mStream0, mStream1,
+                                     mRowIdx, mFenceEvent, mNumThreads);
         return (void *)(uintptr_t)error;
-int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_queue queue, bool separateThreads, GLint rend_vs, GLint read_vs, cl_device_id rend_device )
+int test_fence_sync_single(cl_device_id device, cl_context context,
+                           cl_command_queue queue, bool separateThreads,
+                           GLint rend_vs, GLint read_vs,
+                           cl_device_id rend_device)
     int error;
     const int framebufferSize = 512;
-    if( !is_extension_available( device, "cl_khr_gl_event" ) )
+    if (!is_extension_available(device, "cl_khr_gl_event"))
-        log_info( "NOTE: cl_khr_gl_event extension not present on this device; skipping fence sync test\n" );
+        log_info("NOTE: cl_khr_gl_event extension not present on this device; "
+                 "skipping fence sync test\n");
         return 0;
@@ -312,10 +335,11 @@
     clGetPlatformIDs(0, NULL, &nplatforms);
     clGetPlatformIDs(1, &platform, NULL);
-    if (nplatforms > 1) {
+    if (nplatforms > 1)
+    {
         log_info("clGetPlatformIDs returned multiple values.  This is not "
-            "an error, but might result in obtaining incorrect function "
-            "pointers if you do not want the first returned platform.\n");
+                 "an error, but might result in obtaining incorrect function "
+                 "pointers if you do not want the first returned platform.\n");
         // Show them the platform name, in case it is a problem.
@@ -323,28 +347,35 @@
         char *name;
         clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &size);
-        name = (char*)malloc(size);
+        name = (char *)malloc(size);
         clGetPlatformInfo(platform, CL_PLATFORM_NAME, size, name, NULL);
         log_info("Using platform with name: %s \n", name);
-    clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clCreateEventFromGLsyncKHR");
-    if( clCreateEventFromGLsyncKHR_ptr == NULL )
+    clCreateEventFromGLsyncKHR_ptr =
+        (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(
+            platform, "clCreateEventFromGLsyncKHR");
+    if (clCreateEventFromGLsyncKHR_ptr == NULL)
-        log_error( "ERROR: Unable to run fence_sync test (clCreateEventFromGLsyncKHR function not discovered!)\n" );
-        clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clCreateEventFromGLsyncAPPLE");
+        log_error("ERROR: Unable to run fence_sync test "
+                  "(clCreateEventFromGLsyncKHR function not discovered!)\n");
+        clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(
+                platform, "clCreateEventFromGLsyncAPPLE");
         return -1;
 #ifdef USING_ARB_sync
-    char *gl_version_str = (char*)glGetString( GL_VERSION );
+    char *gl_version_str = (char *)glGetString(GL_VERSION);
     float glCoreVersion;
     sscanf(gl_version_str, "%f", &glCoreVersion);
-    if( glCoreVersion < 3.0f )
+    if (glCoreVersion < 3.0f)
-        log_info( "OpenGL version %f does not support fence/sync! Skipping test.\n", glCoreVersion );
+        log_info(
+            "OpenGL version %f does not support fence/sync! Skipping test.\n",
+            glCoreVersion);
         return 0;
@@ -354,10 +385,13 @@
     GLint val, screen;
     CGLGetVirtualScreen(currCtx, &screen);
     CGLDescribePixelFormat(pixFmt, screen, kCGLPFAOpenGLProfile, &val);
-    if(val != kCGLOGLPVersion_3_2_Core)
+    if (val != kCGLOGLPVersion_3_2_Core)
-        log_error( "OpenGL context was not created with OpenGL version >= 3.0 profile even though platform supports it"
-                  "OpenGL profile %f does not support fence/sync! Skipping test.\n", glCoreVersion );
+        log_error(
+            "OpenGL context was not created with OpenGL version >= 3.0 profile "
+            "even though platform supports it"
+            "OpenGL profile %f does not support fence/sync! Skipping test.\n",
+            glCoreVersion);
         return -1;
@@ -365,7 +399,7 @@
     HDC hdc = wglGetCurrentDC();
     HGLRC hglrc = wglGetCurrentContext();
-    Display* dpy = glXGetCurrentDisplay();
+    Display *dpy = glXGetCurrentDisplay();
     GLXDrawable drawable = glXGetCurrentDrawable();
     GLXContext ctx = glXGetCurrentContext();
@@ -386,51 +420,66 @@
     GLint posLoc, colLoc;
     GLuint shaderprogram = createShaderProgram(&posLoc, &colLoc);
-    if(!shaderprogram)
+    if (!shaderprogram)
         log_error("Failed to create shader program\n");
         return -1;
-    float l = 0.0f; float r = framebufferSize;
-    float b = 0.0f; float t = framebufferSize;
+    float l = 0.0f;
+    float r = framebufferSize;
+    float b = 0.0f;
+    float t = framebufferSize;
-    float projMatrix[16] = { 2.0f/(r-l), 0.0f, 0.0f, 0.0f,
-        0.0f, 2.0f/(t-b), 0.0f, 0.0f,
-        0.0f, 0.0f, -1.0f, 0.0f,
-        -(r+l)/(r-l), -(t+b)/(t-b), 0.0f, 1.0f
-    };
+    float projMatrix[16] = { 2.0f / (r - l),
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             2.0f / (t - b),
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             -1.0f,
+                             0.0f,
+                             -(r + l) / (r - l),
+                             -(t + b) / (t - b),
+                             0.0f,
+                             1.0f };
     GLuint projMatLoc = glGetUniformLocation(shaderprogram, "projMatrix");
     glUniformMatrix4fv(projMatLoc, 1, 0, projMatrix);
-    // Note: the framebuffer is just the target to verify our results against, so we don't
-    // really care to go through all the possible formats in this case
+    // Note: the framebuffer is just the target to verify our results against,
+    // so we don't really care to go through all the possible formats in this
+    // case
     glFramebufferWrapper glFramebuffer;
     glRenderbufferWrapper glRenderbuffer;
-    error = CreateGLRenderbufferRaw( framebufferSize, 128, GL_COLOR_ATTACHMENT0_EXT,
-                                    GL_RGBA, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV,
-                                    &glFramebuffer, &glRenderbuffer );
-    if( error != 0 )
-        return error;
+    error = CreateGLRenderbufferRaw(
+        framebufferSize, 128, GL_COLOR_ATTACHMENT0_EXT, GL_RGBA, GL_RGBA,
+        GL_UNSIGNED_INT_8_8_8_8_REV, &glFramebuffer, &glRenderbuffer);
+    if (error != 0) return error;
     GLuint vao;
     glGenVertexArrays(1, &vao);
     glBufferWrapper vtxBuffer, colorBuffer;
-    glGenBuffers( 1, &vtxBuffer );
-    glGenBuffers( 1, &colorBuffer );
+    glGenBuffers(1, &vtxBuffer);
+    glGenBuffers(1, &colorBuffer);
-    const int numHorizVertices = ( framebufferSize * 64 ) + 1;
+    const int numHorizVertices = (framebufferSize * 64) + 1;
-    glBindBuffer( GL_ARRAY_BUFFER, vtxBuffer );
-    glBufferData( GL_ARRAY_BUFFER, sizeof( GLfloat ) * numHorizVertices * 2 * 4, NULL, GL_STATIC_DRAW );
+    glBindBuffer(GL_ARRAY_BUFFER, vtxBuffer);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * numHorizVertices * 2 * 4,
+                 NULL, GL_STATIC_DRAW);
-    glBindBuffer( GL_ARRAY_BUFFER, colorBuffer );
-    glBufferData( GL_ARRAY_BUFFER, sizeof( GLfloat ) * numHorizVertices * 2 * 4, NULL, GL_STATIC_DRAW );
+    glBindBuffer(GL_ARRAY_BUFFER, colorBuffer);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * numHorizVertices * 2 * 4,
+                 NULL, GL_STATIC_DRAW);
     // Now that the requisite objects are bound, we can attempt program
     // validation:
@@ -439,8 +488,9 @@
     GLint logLength, status;
     glGetProgramiv(shaderprogram, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*)malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetProgramInfoLog(shaderprogram, logLength, &logLength, log);
         log_info("Program validate log:\n%s", log);
@@ -455,125 +505,131 @@
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper streams[ 2 ];
+    clMemWrapper streams[2];
-    if( create_single_kernel_helper( context, &program, &kernel, 1, updateBuffersKernel, "update" ) )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    updateBuffersKernel, "update"))
         return -1;
-    streams[ 0 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_WRITE, vtxBuffer, &error );
-    test_error( error, "Unable to create CL buffer from GL vertex buffer" );
+    streams[0] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_WRITE,
+                                             vtxBuffer, &error);
+    test_error(error, "Unable to create CL buffer from GL vertex buffer");
-    streams[ 1 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_WRITE, colorBuffer, &error );
-    test_error( error, "Unable to create CL buffer from GL color buffer" );
+    streams[1] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_WRITE,
+                                             colorBuffer, &error);
+    test_error(error, "Unable to create CL buffer from GL color buffer");
-    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel arguments");
-    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel arguments");
     cl_int horizWrap = (cl_int)framebufferSize;
-    error = clSetKernelArg( kernel, 2, sizeof( horizWrap ), &horizWrap );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 2, sizeof(horizWrap), &horizWrap);
+    test_error(error, "Unable to set kernel arguments");
-    glViewport( 0, 0, framebufferSize, framebufferSize );
-    glClearColor( 0, 0, 0, 0 );
-    glClear( GL_COLOR_BUFFER_BIT );
-    glClear( GL_DEPTH_BUFFER_BIT );
-    glDisable( GL_DEPTH_TEST );
-    glEnable( GL_BLEND );
-    glBlendFunc( GL_ONE, GL_ONE );
+    glViewport(0, 0, framebufferSize, framebufferSize);
+    glClearColor(0, 0, 0, 0);
+    glClear(GL_COLOR_BUFFER_BIT);
+    glClear(GL_DEPTH_BUFFER_BIT);
+    glDisable(GL_DEPTH_TEST);
+    glEnable(GL_BLEND);
+    glBlendFunc(GL_ONE, GL_ONE);
     clEventWrapper fenceEvent;
     GLsync glFence = 0;
     // Do a loop through 8 different horizontal stripes against the framebuffer
-    RunThread thread( kernel, queue, streams[ 0 ], streams[ 1 ], (size_t)numHorizVertices );
+    RunThread thread(kernel, queue, streams[0], streams[1],
+                     (size_t)numHorizVertices);
-    for( int i = 0; i < 8; i++ )
+    for (int i = 0; i < 8; i++)
         // if current rendering device is not the compute device and
         // separateThreads == false which means compute is going on same
         // thread and we are using implicit synchronization (no GLSync obj used)
-        // then glFlush by clEnqueueAcquireGLObject is not sufficient ... we need
-        // to wait for rendering to finish on other device before CL can start
-        // writing to CL/GL shared mem objects. When separateThreads is true i.e.
-        // we are using GLSync obj to synchronize then we dont need to call glFinish
-        // here since CL should wait for rendering on other device before this
-        // GLSync object to finish before it starts writing to shared mem object.
-        // Also rend_device == compute_device no need to call glFinish
-        if(rend_device != device && !separateThreads)
-            glFinish();
+        // then glFlush by clEnqueueAcquireGLObject is not sufficient ... we
+        // need to wait for rendering to finish on other device before CL can
+        // start writing to CL/GL shared mem objects. When separateThreads is
+        // true i.e. we are using GLSync obj to synchronize then we dont need to
+        // call glFinish here since CL should wait for rendering on other device
+        // before this GLSync object to finish before it starts writing to
+        // shared mem object. Also rend_device == compute_device no need to call
+        // glFinish
+        if (rend_device != device && !separateThreads) glFinish();
-        if( separateThreads )
+        if (separateThreads)
-            if (fenceEvent != NULL)
-            {
-                clReleaseEvent(fenceEvent);
-                glDeleteSyncFunc(glFence);
-            }
+            glDeleteSyncFunc(glFence);
             glFence = glFenceSyncFunc(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-            fenceEvent = clCreateEventFromGLsyncKHR_ptr(context, glFence, &error);
+            fenceEvent =
+                clCreateEventFromGLsyncKHR_ptr(context, glFence, &error);
             test_error(error, "Unable to create CL event from GL fence");
-            // in case of explicit synchronization, we just wait for the sync object to complete
-            // in clEnqueueAcquireGLObject but we dont flush. Its application's responsibility
-            // to flush on the context on which glSync is created
+            // in case of explicit synchronization, we just wait for the sync
+            // object to complete in clEnqueueAcquireGLObject but we dont flush.
+            // Its application's responsibility to flush on the context on which
+            // glSync is created
-            thread.SetRunData( (cl_int)i, fenceEvent );
+            thread.SetRunData((cl_int)i, fenceEvent);
             error = (cl_int)(size_t)thread.Join();
-            error = run_cl_kernel( kernel, queue, streams[ 0 ], streams[ 1 ], (cl_int)i, fenceEvent, (size_t)numHorizVertices );
+            error =
+                run_cl_kernel(kernel, queue, streams[0], streams[1], (cl_int)i,
+                              fenceEvent, (size_t)numHorizVertices);
-        test_error( error, "Unable to run CL kernel" );
+        test_error(error, "Unable to run CL kernel");
-        glBindBuffer( GL_ARRAY_BUFFER, vtxBuffer );
-        glVertexAttribPointer(posLoc, 4, GL_FLOAT, GL_FALSE, 4*sizeof(GLfloat), 0);
-        glBindBuffer( GL_ARRAY_BUFFER, colorBuffer );
-        glVertexAttribPointer(colLoc, 4, GL_FLOAT, GL_FALSE, 4*sizeof(GLfloat), 0);
-        glBindBuffer( GL_ARRAY_BUFFER, 0 );
+        glBindBuffer(GL_ARRAY_BUFFER, vtxBuffer);
+        glVertexAttribPointer(posLoc, 4, GL_FLOAT, GL_FALSE,
+                              4 * sizeof(GLfloat), 0);
+        glBindBuffer(GL_ARRAY_BUFFER, colorBuffer);
+        glVertexAttribPointer(colLoc, 4, GL_FLOAT, GL_FALSE,
+                              4 * sizeof(GLfloat), 0);
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
-        glDrawArrays( GL_TRIANGLE_STRIP, 0, numHorizVertices * 2 );
+        glDrawArrays(GL_TRIANGLE_STRIP, 0, numHorizVertices * 2);
-        if( separateThreads )
+        if (separateThreads)
-            // If we're on the same thread, then we're testing implicit syncing, so we
-            // don't need the actual fence code
-            if( fenceEvent != NULL )
-            {
-                clReleaseEvent( fenceEvent );
-                glDeleteSyncFunc( glFence );
-            }
+            // If we're on the same thread, then we're testing implicit syncing,
+            // so we don't need the actual fence code
+            glDeleteSyncFunc(glFence);
-            glFence = glFenceSyncFunc( GL_SYNC_GPU_COMMANDS_COMPLETE, 0 );
-            fenceEvent = clCreateEventFromGLsyncKHR_ptr( context, glFence, &error );
-            test_error( error, "Unable to create CL event from GL fence" );
-            // in case of explicit synchronization, we just wait for the sync object to complete
-            // in clEnqueueAcquireGLObject but we dont flush. Its application's responsibility
-            // to flush on the context on which glSync is created
+            glFence = glFenceSyncFunc(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+            fenceEvent =
+                clCreateEventFromGLsyncKHR_ptr(context, glFence, &error);
+            test_error(error, "Unable to create CL event from GL fence");
+            // in case of explicit synchronization, we just wait for the sync
+            // object to complete in clEnqueueAcquireGLObject but we dont flush.
+            // Its application's responsibility to flush on the context on which
+            // glSync is created
-    if( glFence != 0 )
-        // Don't need the final release for fenceEvent, because the wrapper will take care of that
-        glDeleteSyncFunc( glFence );
+    if (glFence != 0)
+        // Don't need the final release for fenceEvent, because the wrapper will
+        // take care of that
+        glDeleteSyncFunc(glFence);
 #ifdef __APPLE__
     CGLSetVirtualScreen(CGLGetCurrentContext(), read_vs);
@@ -585,54 +641,62 @@
     // Grab the contents of the final framebuffer
-    BufferOwningPtr<char> resultData( ReadGLRenderbuffer( glFramebuffer, glRenderbuffer,
-                                                         GL_COLOR_ATTACHMENT0_EXT,
-                                                         GL_RGBA, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar,
-                                                         framebufferSize, 128 ) );
+    BufferOwningPtr<char> resultData(ReadGLRenderbuffer(
+        glFramebuffer, glRenderbuffer, GL_COLOR_ATTACHMENT0_EXT, GL_RGBA,
+        GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar, framebufferSize, 128));
-    // Check the contents now. We should end up with solid color bands 32 pixels high and the
-    // full width of the framebuffer, at values (128,128,128) due to the additive blending
-    for( int i = 0; i < 8; i++ )
+    // Check the contents now. We should end up with solid color bands 32 pixels
+    // high and the full width of the framebuffer, at values (128,128,128) due
+    // to the additive blending
+    for (int i = 0; i < 8; i++)
-        for( int y = 0; y < 4; y++ )
+        for (int y = 0; y < 4; y++)
-            // Note: coverage will be double because the 63-0 triangle overwrites again at the end of the pass
-            cl_uchar valA = ( ( ( i + 1 ) & 1 )      ) * numHorizVertices * 2 / framebufferSize;
-            cl_uchar valB = ( ( ( i + 1 ) & 2 ) >> 1 ) * numHorizVertices * 2 / framebufferSize;
-            cl_uchar valC = ( ( ( i + 1 ) & 4 ) >> 2 ) * numHorizVertices * 2 / framebufferSize;
+            // Note: coverage will be double because the 63-0 triangle
+            // overwrites again at the end of the pass
+            cl_uchar valA =
+                (((i + 1) & 1)) * numHorizVertices * 2 / framebufferSize;
+            cl_uchar valB =
+                (((i + 1) & 2) >> 1) * numHorizVertices * 2 / framebufferSize;
+            cl_uchar valC =
+                (((i + 1) & 4) >> 2) * numHorizVertices * 2 / framebufferSize;
-            cl_uchar *row = (cl_uchar *)&resultData[ ( i * 16 + y ) * framebufferSize * 4 ];
-            for( int x = 0; x < ( framebufferSize - 1 ) - 1; x++ )
+            cl_uchar *row =
+                (cl_uchar *)&resultData[(i * 16 + y) * framebufferSize * 4];
+            for (int x = 0; x < (framebufferSize - 1) - 1; x++)
-                if( ( row[ x * 4 ] != valA ) || ( row[ x * 4 + 1 ] != valB ) ||
-                   ( row[ x * 4 + 2 ] != valC ) )
+                if ((row[x * 4] != valA) || (row[x * 4 + 1] != valB)
+                    || (row[x * 4 + 2] != valC))
-                    log_error( "ERROR: Output framebuffer did not validate!\n" );
-                    DumpGLBuffer( GL_UNSIGNED_BYTE, framebufferSize, 128, resultData );
-                    log_error( "RUNS:\n" );
+                    log_error("ERROR: Output framebuffer did not validate!\n");
+                    DumpGLBuffer(GL_UNSIGNED_BYTE, framebufferSize, 128,
+                                 resultData);
+                    log_error("RUNS:\n");
                     uint32_t *p = (uint32_t *)(char *)resultData;
                     size_t a = 0;
-                    for( size_t t = 1; t < framebufferSize * framebufferSize; t++ )
+                    for (size_t t = 1; t < framebufferSize * framebufferSize;
+                         t++)
-                        if( p[ a ] != 0 )
+                        if (p[a] != 0)
-                            if( p[ t ] == 0 )
+                            if (p[t] == 0)
-                                log_error( "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", a, t - 1,
-                                          (int)( a % framebufferSize ), (int)( a / framebufferSize ),
-                                          (int)( ( t - 1 ) % framebufferSize ), (int)( ( t - 1 ) / framebufferSize ),
-                                          p[ a ] );
+                                log_error(
+                                    "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n",
+                                    a, t - 1, (int)(a % framebufferSize),
+                                    (int)(a / framebufferSize),
+                                    (int)((t - 1) % framebufferSize),
+                                    (int)((t - 1) / framebufferSize), p[a]);
                                 a = t;
-                            if( p[ t ] != 0 )
+                            if (p[t] != 0)
                                 a = t;
                     return -1;
@@ -645,46 +709,56 @@
     return 0;
-int test_fence_sync( cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
+int test_fence_sync(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int numElements)
     GLint vs_count = 0;
     cl_device_id *device_list = NULL;
-    if( !is_extension_available( device, "cl_khr_gl_event" ) )
+    if (!is_extension_available(device, "cl_khr_gl_event"))
-        log_info( "NOTE: cl_khr_gl_event extension not present on this device; skipping fence sync test\n" );
+        log_info("NOTE: cl_khr_gl_event extension not present on this device; "
+                 "skipping fence sync test\n");
         return 0;
 #ifdef __APPLE__
     CGLContextObj ctx = CGLGetCurrentContext();
     CGLPixelFormatObj pix = CGLGetPixelFormat(ctx);
-    CGLError err = CGLDescribePixelFormat(pix, 0, kCGLPFAVirtualScreenCount, &vs_count);
+    CGLError err =
+        CGLDescribePixelFormat(pix, 0, kCGLPFAVirtualScreenCount, &vs_count);
-    device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*vs_count);
-    clGetGLContextInfoAPPLE(context, ctx, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, sizeof(cl_device_id)*vs_count, device_list, NULL);
+    device_list = (cl_device_id *)malloc(sizeof(cl_device_id) * vs_count);
+    clGetGLContextInfoAPPLE(context, ctx,
+                            sizeof(cl_device_id) * vs_count, device_list, NULL);
-    // Need platform specific way of getting devices from CL context to which OpenGL can render
-    // If not available it can be replaced with clGetContextInfo with CL_CONTEXT_DEVICES
+    // Need platform specific way of getting devices from CL context to which
+    // OpenGL can render If not available it can be replaced with
+    // clGetContextInfo with CL_CONTEXT_DEVICES
     size_t device_cb;
-    cl_int err = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &device_cb);
-    if( err != CL_SUCCESS )
+    cl_int err =
+        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_cb);
+    if (err != CL_SUCCESS)
-      print_error( err, "Unable to get device count from context" );
-      return -1;
+        print_error(err, "Unable to get device count from context");
+        return -1;
     vs_count = (GLint)device_cb / sizeof(cl_device_id);
-    if (vs_count < 1) {
-      log_error("No devices found.\n");
-      return -1;
+    if (vs_count < 1)
+    {
+        log_error("No devices found.\n");
+        return -1;
-    device_list = (cl_device_id *) malloc(device_cb);
-    err = clGetContextInfo( context, CL_CONTEXT_DEVICES, device_cb, device_list, NULL);
-    if( err != CL_SUCCESS ) {
-      free(device_list);
-      print_error( err, "Unable to get device list from context" );
-      return -1;
+    device_list = (cl_device_id *)malloc(device_cb);
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, device_cb, device_list,
+                           NULL);
+    if (err != CL_SUCCESS)
+    {
+        free(device_list);
+        print_error(err, "Unable to get device list from context");
+        return -1;
@@ -695,30 +769,38 @@
     // Loop through all the devices capable to OpenGL rendering
     // and set them as current rendering target
-    for(rend_vs = 0; rend_vs < vs_count; rend_vs++)
+    for (rend_vs = 0; rend_vs < vs_count; rend_vs++)
         // Loop through all the devices and set them as current
         // compute target
-        for(read_vs = 0; read_vs < vs_count; read_vs++)
+        for (read_vs = 0; read_vs < vs_count; read_vs++)
-            cl_device_id rend_device = device_list[rend_vs], read_device = device_list[read_vs];
+            cl_device_id rend_device = device_list[rend_vs],
+                         read_device = device_list[read_vs];
             char rend_name[200], read_name[200];
-            clGetDeviceInfo(rend_device, CL_DEVICE_NAME, sizeof(rend_name), rend_name, NULL);
-            clGetDeviceInfo(read_device, CL_DEVICE_NAME, sizeof(read_name), read_name, NULL);
+            clGetDeviceInfo(rend_device, CL_DEVICE_NAME, sizeof(rend_name),
+                            rend_name, NULL);
+            clGetDeviceInfo(read_device, CL_DEVICE_NAME, sizeof(read_name),
+                            read_name, NULL);
-            log_info("Rendering on: %s, read back on: %s\n", rend_name, read_name);
-            error = test_fence_sync_single( device, context, queue, false, rend_vs, read_vs, rend_device );
+            log_info("Rendering on: %s, read back on: %s\n", rend_name,
+                     read_name);
+            error = test_fence_sync_single(device, context, queue, false,
+                                           rend_vs, read_vs, rend_device);
             any_failed |= error;
-            if( error != 0 )
-                log_error( "ERROR: Implicit syncing with GL sync events failed!\n\n" );
+            if (error != 0)
+                log_error(
+                    "ERROR: Implicit syncing with GL sync events failed!\n\n");
                 log_info("Implicit syncing Passed\n");
-            error = test_fence_sync_single( device, context, queue, true, rend_vs, read_vs, rend_device );
+            error = test_fence_sync_single(device, context, queue, true,
+                                           rend_vs, read_vs, rend_device);
             any_failed |= error;
-            if( error != 0 )
-                log_error( "ERROR: Explicit syncing with GL sync events failed!\n\n" );
+            if (error != 0)
+                log_error(
+                    "ERROR: Explicit syncing with GL sync events failed!\n\n");
                 log_info("Explicit syncing Passed\n");
diff --git a/test_conformance/gl/test_image_methods.cpp b/test_conformance/gl/test_image_methods.cpp
index 07f5b65..7d055fb 100644
--- a/test_conformance/gl/test_image_methods.cpp
+++ b/test_conformance/gl/test_image_methods.cpp
@@ -337,7 +337,6 @@
     return 0;
-    size_t pixelSize;
     int result = 0;
   GLenum depth_targets[] = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
   size_t ntargets = sizeof(depth_targets) / sizeof(depth_targets[0]);
@@ -378,7 +377,6 @@
     return 0;
-    size_t pixelSize;
   int result = 0;
   size_t ntargets = sizeof(targets) / sizeof(targets[0]);
diff --git a/test_conformance/gl/test_images_getinfo_common.cpp b/test_conformance/gl/test_images_getinfo_common.cpp
index 345b595..2322c26 100644
--- a/test_conformance/gl/test_images_getinfo_common.cpp
+++ b/test_conformance/gl/test_images_getinfo_common.cpp
@@ -86,10 +86,11 @@
   return CheckGLObjectInfo(streams[0], object_type, glTexture, glTarget, 0);
-static int test_image_format_get_info(
-    cl_context context, cl_command_queue queue,
-    size_t width, size_t height, size_t depth,
-    GLenum target, struct format* fmt, MTdata data)
+static int test_image_format_get_info(cl_context context,
+                                      cl_command_queue queue, size_t width,
+                                      size_t height, size_t depth,
+                                      GLenum target, const format *fmt,
+                                      MTdata data)
   int error = 0;
@@ -197,9 +198,11 @@
     &actualType, (void **)&outBuffer );
-int test_images_get_info_common( cl_device_id device, cl_context context,
-  cl_command_queue queue, struct format* formats, size_t nformats,
-  GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes )
+int test_images_get_info_common(cl_device_id device, cl_context context,
+                                cl_command_queue queue, const format *formats,
+                                size_t nformats, GLenum *targets,
+                                size_t ntargets, sizevec_t *sizes,
+                                size_t nsizes)
   int error = 0;
   RandomSeed seed(gRandomSeed);
diff --git a/test_conformance/gl/test_images_read_common.cpp b/test_conformance/gl/test_images_read_common.cpp
index 112c789..fe2a529 100644
--- a/test_conformance/gl/test_images_read_common.cpp
+++ b/test_conformance/gl/test_images_read_common.cpp
@@ -386,10 +386,9 @@
     width, height, depth, sampleNum, outFormat, outType, outResultBuffer );
-static int test_image_format_read(
-    cl_context context, cl_command_queue queue,
-    size_t width, size_t height, size_t depth,
-    GLenum target, struct format* fmt, MTdata data)
+static int test_image_format_read(cl_context context, cl_command_queue queue,
+                                  size_t width, size_t height, size_t depth,
+                                  GLenum target, const format *fmt, MTdata data)
   int error = 0;
@@ -645,9 +644,10 @@
-int test_images_read_common( cl_device_id device, cl_context context,
-  cl_command_queue queue, struct format* formats, size_t nformats,
-  GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes )
+int test_images_read_common(cl_device_id device, cl_context context,
+                            cl_command_queue queue, const format *formats,
+                            size_t nformats, GLenum *targets, size_t ntargets,
+                            sizevec_t *sizes, size_t nsizes)
   int error = 0;
   RandomSeed seed(gRandomSeed);
diff --git a/test_conformance/gl/test_images_write_common.cpp b/test_conformance/gl/test_images_write_common.cpp
index 9bbb257..0dba83b 100644
--- a/test_conformance/gl/test_images_write_common.cpp
+++ b/test_conformance/gl/test_images_write_common.cpp
@@ -427,7 +427,6 @@
 int supportsHalf(cl_context context, bool* supports_half)
   int error;
-  size_t  size;
   cl_uint numDev;
   error = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDev, NULL);
@@ -446,7 +445,6 @@
 int supportsMsaa(cl_context context, bool* supports_msaa)
   int error;
-  size_t  size;
   cl_uint numDev;
   error = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDev, NULL);
@@ -465,7 +463,6 @@
 int supportsDepth(cl_context context, bool* supports_depth)
   int error;
-  size_t  size;
   cl_uint numDev;
   error = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDev, NULL);
@@ -486,7 +483,6 @@
   GLenum internalFormat,  GLenum glType, ExplicitType type, MTdata d )
   int error;
-  int samples = 8;
   // If we're testing a half float format, then we need to determine the
   // rounding mode of this machine.  Punt if we fail to do so.
@@ -664,8 +660,9 @@
 // combination.
 int test_images_write_common(cl_device_id device, cl_context context,
-  cl_command_queue queue, struct format* formats, size_t nformats,
-  GLenum *targets, size_t ntargets, sizevec_t* sizes, size_t nsizes )
+                             cl_command_queue queue, const format *formats,
+                             size_t nformats, GLenum *targets, size_t ntargets,
+                             sizevec_t *sizes, size_t nsizes)
   int err = 0;
   int error = 0;
diff --git a/test_conformance/gles/CMakeLists.txt b/test_conformance/gles/CMakeLists.txt
index c76fe51..4f4ba53 100644
--- a/test_conformance/gles/CMakeLists.txt
+++ b/test_conformance/gles/CMakeLists.txt
@@ -18,3 +18,11 @@
+    target_compile_definitions(${${MODULE_NAME}_OUT} PRIVATE GLES3)
+    # Don't warn about using the portable "strdup" function.
+    target_compile_definitions(${${MODULE_NAME}_OUT} PRIVATE _CRT_NONSTDC_NO_DEPRECATE)
\ No newline at end of file
diff --git a/test_conformance/gles/main.cpp b/test_conformance/gles/main.cpp
index 644fa63..60e020d 100644
--- a/test_conformance/gles/main.cpp
+++ b/test_conformance/gles/main.cpp
@@ -320,8 +320,10 @@
       goto cleanup;
+#ifdef GLES3
     int argc_ = (first_32_testname) ? 1 + (argc - first_32_testname) : argc;
     const char** argv_ = (first_32_testname) ? &argv[first_32_testname-1] : argv;
     // Execute the tests.
       for( size_t i = 0; i < numDevices; i++ ) {
diff --git a/test_conformance/gles/setup_egl.cpp b/test_conformance/gles/setup_egl.cpp
index fe0f8ca..95a12a6 100644
--- a/test_conformance/gles/setup_egl.cpp
+++ b/test_conformance/gles/setup_egl.cpp
@@ -117,7 +117,8 @@
                 _platform, "clGetGLContextInfoKHR");
         if (GetGLContextInfo == NULL)
-            print_error(status, "clGetGLContextInfoKHR failed");
+            log_error("ERROR: clGetGLContextInfoKHR failed! (%s:%d)\n",
+                      __FILE__, __LINE__);
             return NULL;
@@ -128,7 +129,7 @@
             return NULL;
         dev_size /= sizeof(cl_device_id);
-        log_info("GL _context supports %d compute devices\n", dev_size);
+        log_info("GL _context supports %zu compute devices\n", dev_size);
         status =
             GetGLContextInfo(properties, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR,
diff --git a/test_conformance/gles/test_buffers.cpp b/test_conformance/gles/test_buffers.cpp
index a2d6732..7371126 100644
--- a/test_conformance/gles/test_buffers.cpp
+++ b/test_conformance/gles/test_buffers.cpp
@@ -205,10 +205,10 @@
   if (validate_only) {
     int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, (GLuint)inGLBuffer, (GLenum)0, 0) |
                   CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, (GLuint)outGLBuffer, (GLenum)0, 0) );
-    for(i=0;i<3;i++)
+    for (i = 0; i < 3; i++)
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        streams[i].reset();
     glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
@@ -285,10 +285,9 @@
         clP += get_explicit_type_size( vecType );
-    for(i=0;i<3;i++)
+    for (i = 0; i < 3; i++)
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        streams[i].reset();
     glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
diff --git a/test_conformance/gles/test_fence_sync.cpp b/test_conformance/gles/test_fence_sync.cpp
index 0af91a4..968d969 100644
--- a/test_conformance/gles/test_fence_sync.cpp
+++ b/test_conformance/gles/test_fence_sync.cpp
@@ -570,10 +570,12 @@
                             if( p[ t ] == 0 )
-                                log_error( "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", a, t - 1,
-                                          (int)( a % framebufferSize ), (int)( a / framebufferSize ),
-                                          (int)( ( t - 1 ) % framebufferSize ), (int)( ( t - 1 ) / framebufferSize ),
-                                          p[ a ] );
+                                log_error(
+                                    "RUN: %zu to %zu (%d,%d to %d,%d) 0x%08x\n",
+                                    a, t - 1, (int)(a % framebufferSize),
+                                    (int)(a / framebufferSize),
+                                    (int)((t - 1) % framebufferSize),
+                                    (int)((t - 1) / framebufferSize), p[a]);
                                 a = t;
diff --git a/test_conformance/gles/test_images_2D.cpp b/test_conformance/gles/test_images_2D.cpp
index c1a17fc..f655402 100644
--- a/test_conformance/gles/test_images_2D.cpp
+++ b/test_conformance/gles/test_images_2D.cpp
@@ -369,7 +369,9 @@
+#ifdef __APPLE__
 #pragma mark -------------------- Write tests -------------------------
 int test_cl_image_write( cl_context context, cl_command_queue queue, cl_mem clImage,
diff --git a/test_conformance/gles/test_renderbuffer.cpp b/test_conformance/gles/test_renderbuffer.cpp
index 20127ac..0f6d289 100644
--- a/test_conformance/gles/test_renderbuffer.cpp
+++ b/test_conformance/gles/test_renderbuffer.cpp
@@ -197,7 +197,9 @@
+#ifdef __APPLE__
 #pragma mark -------------------- Write tests -------------------------
 int test_attach_renderbuffer_write_to_image( cl_context context, cl_command_queue queue, GLenum glTarget, GLuint glRenderbuffer,
                      size_t imageWidth, size_t imageHeight, cl_image_format *outFormat, ExplicitType *outType, MTdata d, void **outSourceBuffer )
diff --git a/test_conformance/half/Test_roundTrip.cpp b/test_conformance/half/Test_roundTrip.cpp
index 69fc7e4..1ab4093 100644
--- a/test_conformance/half/Test_roundTrip.cpp
+++ b/test_conformance/half/Test_roundTrip.cpp
@@ -14,6 +14,9 @@
 // limitations under the License.
 #include <string.h>
+#include <algorithm>
 #include "cl_utils.h"
 #include "tests.h"
 #include "harness/testHarness.h"
@@ -156,7 +159,7 @@
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
+    size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
     size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
     size_t stride = blockCount;
@@ -168,7 +171,7 @@
     for( i = 0; i < (uint64_t)lastCase; i += stride )
-        count = (uint32_t) MIN( blockCount, lastCase - i );
+        count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
         //Init the input stream
         uint16_t *p = (uint16_t *)gIn_half;
diff --git a/test_conformance/half/Test_vLoadHalf.cpp b/test_conformance/half/Test_vLoadHalf.cpp
index 52867c2..e935401 100644
--- a/test_conformance/half/Test_vLoadHalf.cpp
+++ b/test_conformance/half/Test_vLoadHalf.cpp
@@ -17,6 +17,9 @@
 #include "harness/testHarness.h"
 #include <string.h>
+#include <algorithm>
 #include "cl_utils.h"
 #include "tests.h"
@@ -37,14 +40,12 @@
     const char *vector_size_names[]   = {"1", "2", "4", "8", "16", "3"};
     int minVectorSize = kMinVectorSize;
-    // There is no aligned scalar vloada_half in CL 1.1
-#if ! defined( CL_VERSION_1_1 ) && ! defined(__APPLE__)
-    vlog("Note: testing vloada_half.\n");
-    if (aligned && minVectorSize == 0)
-        minVectorSize = 1;
-    for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+    // There is no aligned scalar vloada_half
+    if (aligned && minVectorSize == 0) minVectorSize = 1;
+    for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+         vectorSize++)
         int effectiveVectorSize = g_arrVecSizes[vectorSize];
@@ -81,7 +82,7 @@
             "   size_t i = get_global_id(0);\n"
             "   f[i] = vloada_half3( i, p );\n"
-            "   ((__global float *)f)[4*i+3] = vloada_half(4*i+3,p);\n"
+            "   ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
@@ -431,7 +432,7 @@
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
+    size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
     size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
@@ -449,7 +450,7 @@
     for( i = 0; i < (uint64_t)lastCase; i += blockCount )
-        count = (uint32_t) MIN( blockCount, lastCase - i );
+        count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
         //Init the input stream
         uint16_t *p = (uint16_t *)gIn_half;
diff --git a/test_conformance/half/Test_vStoreHalf.cpp b/test_conformance/half/Test_vStoreHalf.cpp
index c3a328a..591470f 100644
--- a/test_conformance/half/Test_vStoreHalf.cpp
+++ b/test_conformance/half/Test_vStoreHalf.cpp
@@ -18,6 +18,9 @@
 #include "harness/testHarness.h"
 #include <string.h>
+#include <algorithm>
 #include "cl_utils.h"
 #include "tests.h"
@@ -78,7 +81,7 @@
     cl_ushort *r = cri->r + off;
     f2h f = cri->f;
     cl_ulong i = cri->i + off;
-    cl_uint j, rr;
+    cl_uint j;
     if (off + count > lim)
         count = lim - off;
@@ -114,8 +117,7 @@
         return 0;
     for (j = 0; j < count; j++) {
-    if (s[j] == r[j])
-        continue;
+        if (s[j] == r[j]) continue;
         // Pass any NaNs
         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
@@ -186,8 +188,7 @@
         return 0;
     for (j = 0; j < count; j++) {
-    if (s[j] == r[j])
-        continue;
+        if (s[j] == r[j]) continue;
         // Pass any NaNs
         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
@@ -419,7 +420,9 @@
             "__kernel void test( __global float *p, __global half *f,\n"
             "                   uint extra_last_thread )\n"
-            "   __local ushort data[3*(", local_buf_size, "+1)];\n"
+            "   __local ushort data[3*(",
+            local_buf_size,
+            "+1)];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   size_t last_i = get_global_size(0)-1;\n"
@@ -429,9 +432,18 @@
             "   if(last_i == i && extra_last_thread != 0) {\n"
             "     adjust = 3-extra_last_thread;\n"
             "   } "
-            "   vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
+            "   vstore_half3",
+            roundName,
+            "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
-            "   async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
+            "   if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
+            "       extra_last_thread != 0) {\n"
+            "     adjust = 3-extra_last_thread;\n"
+            "   }\n"
+            "   async_event = async_work_group_copy(\n"
+            "       (__global ushort*)(f+3*(i-lid)),\n"
+            "       (__local ushort *)(&data[adjust]),\n"
+            "       lsize*3-adjust, 0);\n" // investigate later
             "   wait_group_events(1, &async_event);\n"
@@ -521,7 +533,9 @@
             "__kernel void test( __global double *p, __global half *f,\n"
             "                   uint extra_last_thread )\n"
-            "   __local ushort data[3*(", local_buf_size, "+1)];\n"
+            "   __local ushort data[3*(",
+            local_buf_size,
+            "+1)];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   size_t last_i = get_global_size(0)-1;\n"
@@ -531,15 +545,23 @@
             "   if(last_i == i && extra_last_thread != 0) {\n"
             "     adjust = 3-extra_last_thread;\n"
             "   }\n "
-            "   vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
+            "   vstore_half3",
+            roundName,
+            "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
-            "   async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
+            "   if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
+            "       extra_last_thread != 0) {\n"
+            "     adjust = 3-extra_last_thread;\n"
+            "   }\n"
+            "   async_event = async_work_group_copy(\n"
+            "       (__global ushort *)(f+3*(i-lid)),\n"
+            "       (__local ushort *)(&data[adjust]),\n"
+            "       lsize*3-adjust, 0);\n" // investigate later
             "   wait_group_events(1, &async_event);\n"
         if(g_arrVecSizes[vectorSize] == 3) {
             programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
         } else {
@@ -674,7 +696,7 @@
     } // end for vector size
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
+    size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
     size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
     uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
     size_t stride = blockCount;
@@ -726,7 +748,7 @@
     for( i = 0; i < lastCase; i += stride )
-        count = (cl_uint) MIN( blockCount, lastCase - i );
+        count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
         fref.i = i;
         dref.i = i;
@@ -1272,7 +1294,7 @@
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
+    size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
     size_t blockCount = BUFFER_SIZE / elementSize;
     uint64_t lastCase = 1ULL << (8*sizeof(float));
     size_t stride = blockCount;
@@ -1323,7 +1345,7 @@
     for( i = 0; i < (uint64_t)lastCase; i += stride )
-        count = (cl_uint) MIN( blockCount, lastCase - i );
+        count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
         fref.i = i;
         dref.i = i;
diff --git a/test_conformance/half/main.cpp b/test_conformance/half/main.cpp
index 6600cc5..6bc7db9 100644
--- a/test_conformance/half/main.cpp
+++ b/test_conformance/half/main.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -131,8 +131,7 @@
 static int ParseArgs( int argc, const char **argv )
     int i;
-    argList = (const char **)calloc( argc - 1, sizeof( char*) );
+    argList = (const char **)calloc(argc, sizeof(char *));
     if( NULL == argList )
         vlog_error( "Failed to allocate memory for argList.\n" );
@@ -222,7 +221,6 @@
       gWimpyMode = 1;
-    vlog( "Test binary built %s %s\n", __DATE__, __TIME__ );
     if( gWimpyMode )
@@ -248,4 +246,3 @@
         vlog("\t\t%s\n", test_list[i].name );
diff --git a/test_conformance/images/clCopyImage/test_copy_1D.cpp b/test_conformance/images/clCopyImage/test_copy_1D.cpp
index 2c996c7..0f6f3ce 100644
--- a/test_conformance/images/clCopyImage/test_copy_1D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_1D.cpp
@@ -113,6 +113,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_1D_array.cpp b/test_conformance/images/clCopyImage/test_copy_1D_array.cpp
index 0b61693..f0b610b 100644
--- a/test_conformance/images/clCopyImage/test_copy_1D_array.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_1D_array.cpp
@@ -118,6 +118,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_2D.cpp b/test_conformance/images/clCopyImage/test_copy_2D.cpp
index 1a69a1f..448b47f 100644
--- a/test_conformance/images/clCopyImage/test_copy_2D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D.cpp
@@ -125,6 +125,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp b/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp
index eb6dd55..1819d87 100644
--- a/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp
@@ -224,6 +224,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp b/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp
index 8a56c95..4ab6b42 100644
--- a/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp
@@ -230,6 +230,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_2D_array.cpp b/test_conformance/images/clCopyImage/test_copy_2D_array.cpp
index 6327ba5..3376bf9 100644
--- a/test_conformance/images/clCopyImage/test_copy_2D_array.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D_array.cpp
@@ -71,6 +71,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_3D.cpp b/test_conformance/images/clCopyImage/test_copy_3D.cpp
index da6731d..cdfdcce 100644
--- a/test_conformance/images/clCopyImage/test_copy_3D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_3D.cpp
@@ -57,6 +57,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp b/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp
index c098f64..1da1e47 100644
--- a/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp
@@ -251,6 +251,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clCopyImage/test_copy_generic.cpp b/test_conformance/images/clCopyImage/test_copy_generic.cpp
index 026916e..3bd1b6e 100644
--- a/test_conformance/images/clCopyImage/test_copy_generic.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_generic.cpp
@@ -228,6 +228,11 @@
         size_t mappedSlicePad = mappedSlice - (mappedRow * height);
+        // For 1Darray, the height variable actually contains the arraysize,
+        // so it can't be used for calculating the slice padding.
+        if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+            mappedSlicePad = mappedSlice - (mappedRow * 1);
         // Copy the image.
         size_t scanlineSize = row_pitch_lod;
         size_t sliceSize = slice_pitch_lod - scanlineSize * height;
@@ -547,18 +552,19 @@
             if( memcmp( sourcePtr, destPtr, scanlineSize ) != 0 )
-                // Find the first missing pixel
+                // Find the first differing pixel
                 size_t pixel_size = get_pixel_size( dstImageInfo->format );
-                size_t where = 0;
-                for( where = 0; where < dstImageInfo->width; where++ )
-                    if( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) )
-                        break;
+                size_t where =
+                    compare_scanlines(dstImageInfo, sourcePtr, destPtr);
-                print_first_pixel_difference_error(
-                    where, sourcePtr + pixel_size * where,
-                    destPtr + pixel_size * where, dstImageInfo, y,
-                    dstImageInfo->depth);
-                return -1;
+                if (where < dstImageInfo->width)
+                {
+                    print_first_pixel_difference_error(
+                        where, sourcePtr + pixel_size * where,
+                        destPtr + pixel_size * where, dstImageInfo, y,
+                        dstImageInfo->depth);
+                    return -1;
+                }
             sourcePtr += rowPitch;
             if((dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY || dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D))
diff --git a/test_conformance/images/clFillImage/test_fill_1D.cpp b/test_conformance/images/clFillImage/test_fill_1D.cpp
index c3f2318..b1550bf 100644
--- a/test_conformance/images/clFillImage/test_fill_1D.cpp
+++ b/test_conformance/images/clFillImage/test_fill_1D.cpp
@@ -80,6 +80,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if ( gTestSmallImages )
diff --git a/test_conformance/images/clFillImage/test_fill_1D_array.cpp b/test_conformance/images/clFillImage/test_fill_1D_array.cpp
index b4347a4..be32ec6 100644
--- a/test_conformance/images/clFillImage/test_fill_1D_array.cpp
+++ b/test_conformance/images/clFillImage/test_fill_1D_array.cpp
@@ -83,6 +83,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if ( gTestSmallImages )
diff --git a/test_conformance/images/clFillImage/test_fill_2D.cpp b/test_conformance/images/clFillImage/test_fill_2D.cpp
index bb66fc2..e941abc 100644
--- a/test_conformance/images/clFillImage/test_fill_2D.cpp
+++ b/test_conformance/images/clFillImage/test_fill_2D.cpp
@@ -83,6 +83,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if ( gTestSmallImages )
diff --git a/test_conformance/images/clFillImage/test_fill_2D_array.cpp b/test_conformance/images/clFillImage/test_fill_2D_array.cpp
index 3265aab..38196cf 100644
--- a/test_conformance/images/clFillImage/test_fill_2D_array.cpp
+++ b/test_conformance/images/clFillImage/test_fill_2D_array.cpp
@@ -87,6 +87,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if ( gTestSmallImages )
diff --git a/test_conformance/images/clFillImage/test_fill_3D.cpp b/test_conformance/images/clFillImage/test_fill_3D.cpp
index 9db0ac7..0b8e4e5 100644
--- a/test_conformance/images/clFillImage/test_fill_3D.cpp
+++ b/test_conformance/images/clFillImage/test_fill_3D.cpp
@@ -87,6 +87,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     if ( gTestSmallImages )
diff --git a/test_conformance/images/clFillImage/test_fill_generic.cpp b/test_conformance/images/clFillImage/test_fill_generic.cpp
index 59bf24a..6cd6beb 100644
--- a/test_conformance/images/clFillImage/test_fill_generic.cpp
+++ b/test_conformance/images/clFillImage/test_fill_generic.cpp
@@ -468,27 +468,19 @@
         for ( size_t y = 0; y < secondDim; y++ )
-            // If the data type is 101010 ignore bits 31 and 32 when comparing the row
-            if (imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010) {
-              for (size_t w=0;w!=scanlineSize/4;++w) {
-                ((cl_uint*)sourcePtr)[w] &= 0x3FFFFFFF;
-                ((cl_uint*)destPtr)[w] &= 0x3FFFFFFF;
-              }
-            }
             if (memcmp( sourcePtr, destPtr, scanlineSize ) != 0)
-                // Find the first missing pixel
+                // Find the first differing pixel
                 size_t pixel_size = get_pixel_size( imageInfo->format );
-                size_t where = 0;
-                for ( where = 0; where < imageInfo->width; where++ )
-                    if ( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) )
-                        break;
+                size_t where = compare_scanlines(imageInfo, sourcePtr, destPtr);
-                print_first_pixel_difference_error(
-                    where, sourcePtr + pixel_size * where,
-                    destPtr + pixel_size * where, imageInfo, y, thirdDim);
-                return -1;
+                if (where < imageInfo->width)
+                {
+                    print_first_pixel_difference_error(
+                        where, sourcePtr + pixel_size * where,
+                        destPtr + pixel_size * where, imageInfo, y, thirdDim);
+                    return -1;
+                }
             total_matched += scanlineSize;
diff --git a/test_conformance/images/clGetInfo/test_1D.cpp b/test_conformance/images/clGetInfo/test_1D.cpp
index 0d704b8..7e04485 100644
--- a/test_conformance/images/clGetInfo/test_1D.cpp
+++ b/test_conformance/images/clGetInfo/test_1D.cpp
@@ -46,6 +46,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clGetInfo/test_1D_2D_array.cpp b/test_conformance/images/clGetInfo/test_1D_2D_array.cpp
index 447fc7c..c35bf22 100644
--- a/test_conformance/images/clGetInfo/test_1D_2D_array.cpp
+++ b/test_conformance/images/clGetInfo/test_1D_2D_array.cpp
@@ -44,6 +44,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
@@ -168,6 +169,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clGetInfo/test_2D.cpp b/test_conformance/images/clGetInfo/test_2D.cpp
index 74a6012..764b186 100644
--- a/test_conformance/images/clGetInfo/test_2D.cpp
+++ b/test_conformance/images/clGetInfo/test_2D.cpp
@@ -285,6 +285,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clGetInfo/test_3D.cpp b/test_conformance/images/clGetInfo/test_3D.cpp
index af5062e..e126186 100644
--- a/test_conformance/images/clGetInfo/test_3D.cpp
+++ b/test_conformance/images/clGetInfo/test_3D.cpp
@@ -47,6 +47,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clReadWriteImage/test_read_1D.cpp b/test_conformance/images/clReadWriteImage/test_read_1D.cpp
index eef5bf4..2d94dc8 100644
--- a/test_conformance/images/clReadWriteImage/test_read_1D.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_1D.cpp
@@ -81,7 +81,6 @@
   for( size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++)
-    float lod_float = (float) lod;
     origin[1] = lod;
     size_t width_lod, row_pitch_lod;
@@ -90,14 +89,17 @@
     region[0] = width_lod;
-    if ( gDebugTrace )
-      if ( gTestMipmaps) {
-        log_info(" - Working at mipLevel :%llu\n", (unsigned long long)lod);
-      }
-      error = clEnqueueWriteImage(queue, image, CL_FALSE,
-        origin, region, ( gEnablePitch ? row_pitch_lod : 0 ), 0,
-        (char*)imageValues + imgValMipLevelOffset, 0, NULL, NULL);
-      if (error != CL_SUCCESS) {
+    if (gDebugTrace)
+        if (gTestMipmaps)
+        {
+            log_info(" - Working at mipLevel :%llu\n", (unsigned long long)lod);
+        }
+    error = clEnqueueWriteImage(queue, image, CL_FALSE, origin, region,
+                                (gEnablePitch ? row_pitch_lod : 0), 0,
+                                (char *)imageValues + imgValMipLevelOffset, 0,
+                                NULL, NULL);
+    if (error != CL_SUCCESS)
+    {
         log_error( "ERROR: Unable to write to 1D image of size %d \n", (int)width_lod );
         return -1;
@@ -185,6 +187,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp b/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp
index 5d5c288..cc90204 100644
--- a/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp
@@ -82,7 +82,6 @@
     for( size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++)
-        float lod_float = (float) lod;
         size_t width_lod, row_pitch_lod, slice_pitch_lod;
         if( gTestMipmaps )
             origin[2] = lod;
@@ -192,6 +191,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clReadWriteImage/test_read_2D.cpp b/test_conformance/images/clReadWriteImage/test_read_2D.cpp
index fb2e794..b610287 100644
--- a/test_conformance/images/clReadWriteImage/test_read_2D.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_2D.cpp
@@ -81,7 +81,6 @@
     for( size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++)
-        float lod_float = (float) lod;
         origin[2] = lod;
         size_t width_lod, height_lod, row_pitch_lod;
@@ -195,6 +194,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp b/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp
index d0113bb..401b0e4 100644
--- a/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp
@@ -83,9 +83,8 @@
     for(size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++)
-        float lod_float = (float) lod;
         origin[3] = lod;
-        size_t width_lod, height_lod, depth_lod, row_pitch_lod, slice_pitch_lod;
+        size_t width_lod, height_lod, row_pitch_lod, slice_pitch_lod;
         width_lod = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
         height_lod = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
@@ -170,6 +169,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/clReadWriteImage/test_read_3D.cpp b/test_conformance/images/clReadWriteImage/test_read_3D.cpp
index 2dcd243..ced04ab 100644
--- a/test_conformance/images/clReadWriteImage/test_read_3D.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_3D.cpp
@@ -83,7 +83,6 @@
     for(size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++)
-        float lod_float = (float) lod;
         origin[3] = lod;
         size_t width_lod, height_lod, depth_lod, row_pitch_lod, slice_pitch_lod;
@@ -175,6 +174,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
         memSize = (cl_ulong)SIZE_MAX;
+        maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/kernel_image_methods/test_1D.cpp b/test_conformance/images/kernel_image_methods/test_1D.cpp
index 0059d4c..934e78b 100644
--- a/test_conformance/images/kernel_image_methods/test_1D.cpp
+++ b/test_conformance/images/kernel_image_methods/test_1D.cpp
@@ -171,6 +171,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/kernel_image_methods/test_1D_array.cpp b/test_conformance/images/kernel_image_methods/test_1D_array.cpp
index 797161c..a824f08 100644
--- a/test_conformance/images/kernel_image_methods/test_1D_array.cpp
+++ b/test_conformance/images/kernel_image_methods/test_1D_array.cpp
@@ -181,6 +181,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/kernel_image_methods/test_2D.cpp b/test_conformance/images/kernel_image_methods/test_2D.cpp
index b0d4a70..07f8d92 100644
--- a/test_conformance/images/kernel_image_methods/test_2D.cpp
+++ b/test_conformance/images/kernel_image_methods/test_2D.cpp
@@ -232,6 +232,7 @@
   if (memSize > (cl_ulong)SIZE_MAX) {
     memSize = (cl_ulong)SIZE_MAX;
+    maxAllocSize = (cl_ulong)SIZE_MAX;
     if( gTestSmallImages )
diff --git a/test_conformance/images/kernel_read_write/CMakeLists.txt b/test_conformance/images/kernel_read_write/CMakeLists.txt
index 595f024..ccd678c 100644
--- a/test_conformance/images/kernel_read_write/CMakeLists.txt
+++ b/test_conformance/images/kernel_read_write/CMakeLists.txt
@@ -14,8 +14,14 @@
+    test_cl_ext_image_requirements_info.cpp
+    test_cl_ext_image_from_buffer.cpp
+# Make unused variables not fatal in this module; see
diff --git a/test_conformance/images/kernel_read_write/main.cpp b/test_conformance/images/kernel_read_write/main.cpp
index 31dceb3..0a93a97 100644
--- a/test_conformance/images/kernel_read_write/main.cpp
+++ b/test_conformance/images/kernel_read_write/main.cpp
@@ -53,6 +53,43 @@
 extern int test_image_set( cl_device_id device, cl_context context, cl_command_queue queue, test_format_set_fn formatTestFn, cl_mem_object_type imageType );
+extern int cl_image_requirements_size_ext_negative(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue);
+extern int cl_image_requirements_size_ext_consistency(cl_device_id device,
+                                                      cl_context context,
+                                                      cl_command_queue queue);
+extern int clGetImageRequirementsInfoEXT_negative(cl_device_id device,
+                                                  cl_context context,
+                                                  cl_command_queue queue);
+extern int cl_image_requirements_max_val_ext_negative(cl_device_id device,
+                                                      cl_context context,
+                                                      cl_command_queue queue);
+extern int cl_image_requirements_max_val_ext_positive(cl_device_id device,
+                                                      cl_context context,
+                                                      cl_command_queue queue);
+extern int image2d_from_buffer_positive(cl_device_id device, cl_context context,
+                                        cl_command_queue queue);
+extern int memInfo_image_from_buffer_positive(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue);
+extern int imageInfo_image_from_buffer_positive(cl_device_id device,
+                                                cl_context context,
+                                                cl_command_queue queue);
+extern int image_from_buffer_alignment_negative(cl_device_id device,
+                                                cl_context context,
+                                                cl_command_queue queue);
+extern int image_from_small_buffer_negative(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue);
+extern int image_from_buffer_fill_positive(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue);
+extern int image_from_buffer_read_positive(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue);
 /** read_write images only support sampler-less read buildt-ins which require special settings
   * for some global parameters. This pair of functions temporarily overwrite those global parameters
   * and then recover them after completing a read_write test.
@@ -246,12 +283,108 @@
     return doTest( device, context, queue, CL_MEM_OBJECT_IMAGE2D_ARRAY );
+int test_cl_image_requirements_size_ext_negative(cl_device_id device,
+                                                 cl_context context,
+                                                 cl_command_queue queue,
+                                                 int num_elements)
+    return cl_image_requirements_size_ext_negative(device, context, queue);
+int test_cl_image_requirements_size_ext_consistency(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements)
+    return cl_image_requirements_size_ext_consistency(device, context, queue);
+int test_clGetImageRequirementsInfoEXT_negative(cl_device_id device,
+                                                cl_context context,
+                                                cl_command_queue queue,
+                                                int num_elements)
+    return clGetImageRequirementsInfoEXT_negative(device, context, queue);
+int test_cl_image_requirements_max_val_ext_negative(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements)
+    return cl_image_requirements_max_val_ext_negative(device, context, queue);
+int test_cl_image_requirements_max_val_ext_positive(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements)
+    return cl_image_requirements_max_val_ext_positive(device, context, queue);
+int test_image2d_from_buffer_positive(cl_device_id device, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+    return image2d_from_buffer_positive(device, context, queue);
+int test_memInfo_image_from_buffer_positive(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements)
+    return memInfo_image_from_buffer_positive(device, context, queue);
+int test_imageInfo_image_from_buffer_positive(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements)
+    return imageInfo_image_from_buffer_positive(device, context, queue);
+int test_image_from_buffer_alignment_negative(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements)
+    return image_from_buffer_alignment_negative(device, context, queue);
+int test_image_from_small_buffer_negative(cl_device_id device,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements)
+    return image_from_small_buffer_negative(device, context, queue);
+int test_image_from_buffer_fill_positive(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         int num_elements)
+    return image_from_buffer_fill_positive(device, context, queue);
+int test_image_from_buffer_read_positive(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         int num_elements)
+    return image_from_buffer_read_positive(device, context, queue);
 test_definition test_list[] = {
-    ADD_TEST( 1D ),
-    ADD_TEST( 2D ),
-    ADD_TEST( 3D ),
-    ADD_TEST( 1Darray ),
-    ADD_TEST( 2Darray ),
+    ADD_TEST(1D),
+    ADD_TEST(2D),
+    ADD_TEST(3D),
+    ADD_TEST(1Darray),
+    ADD_TEST(2Darray),
+    ADD_TEST_VERSION(cl_image_requirements_size_ext_negative, Version(3, 0)),
+    ADD_TEST_VERSION(cl_image_requirements_size_ext_consistency, Version(3, 0)),
+    ADD_TEST_VERSION(clGetImageRequirementsInfoEXT_negative, Version(3, 0)),
+    ADD_TEST_VERSION(cl_image_requirements_max_val_ext_negative, Version(3, 0)),
+    ADD_TEST_VERSION(cl_image_requirements_max_val_ext_positive, Version(3, 0)),
+    ADD_TEST_VERSION(image2d_from_buffer_positive, Version(3, 0)),
+    ADD_TEST_VERSION(memInfo_image_from_buffer_positive, Version(3, 0)),
+    ADD_TEST_VERSION(imageInfo_image_from_buffer_positive, Version(3, 0)),
+    ADD_TEST_VERSION(image_from_buffer_alignment_negative, Version(3, 0)),
+    ADD_TEST_VERSION(image_from_small_buffer_negative, Version(3, 0)),
+    ADD_TEST_VERSION(image_from_buffer_fill_positive, Version(3, 0)),
+    ADD_TEST_VERSION(image_from_buffer_read_positive, Version(3, 0)),
 const int test_num = ARRAY_SIZE( test_list );
diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
new file mode 100644
index 0000000..c664633
--- /dev/null
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
@@ -0,0 +1,124 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#define TEST_IMAGE_SIZE 20
+#define GET_EXTENSION_FUNC(platform, function_name)                            \
+    function_name##_fn function_name = reinterpret_cast<function_name##_fn>(   \
+        clGetExtensionFunctionAddressForPlatform(platform, #function_name));   \
+    if (function_name == nullptr)                                              \
+    {                                                                          \
+        return TEST_FAIL;                                                      \
+    }                                                                          \
+    do                                                                         \
+    {                                                                          \
+    } while (false)
+static inline size_t aligned_size(size_t size, size_t alignment)
+    return (size + alignment - 1) & ~(alignment - 1);
+static inline void* aligned_ptr(void* ptr, size_t alignment)
+    return (void*)(((uintptr_t)ptr + alignment - 1) & ~(alignment - 1));
+static inline size_t get_format_size(cl_context context,
+                                     cl_image_format* format,
+                                     cl_mem_object_type imageType,
+                                     cl_mem_flags flags)
+    cl_image_desc image_desc = { 0 };
+    image_desc.image_type = imageType;
+    /* Size 1 only to query element size */
+    image_desc.image_width = 1;
+    if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType
+        && CL_MEM_OBJECT_IMAGE1D != imageType)
+    {
+        image_desc.image_height = 1;
+    }
+    if (CL_MEM_OBJECT_IMAGE3D == imageType
+        || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType)
+    {
+        image_desc.image_depth = 1;
+    }
+    if (CL_MEM_OBJECT_IMAGE1D_ARRAY == imageType
+        || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType)
+    {
+        image_desc.image_array_size = 1;
+    }
+    cl_int error = 0;
+    cl_mem buffer;
+    if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    {
+        buffer = clCreateBuffer(context, flags,
+                                get_pixel_size(format) * image_desc.image_width,
+                                NULL, &error);
+        test_error(error, "Unable to create buffer");
+        image_desc.buffer = buffer;
+    }
+    cl_mem image =
+        clCreateImage(context, flags, format, &image_desc, nullptr, &error);
+    test_error(error, "Unable to create image");
+    size_t element_size = 0;
+    error = clGetImageInfo(image, CL_IMAGE_ELEMENT_SIZE, sizeof(element_size),
+                           &element_size, nullptr);
+    test_error(error, "Error clGetImageInfo");
+    error = clReleaseMemObject(image);
+    test_error(error, "Unable to release image");
+    if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    {
+        error = clReleaseMemObject(buffer);
+        test_error(error, "Unable to release buffer");
+    }
+    return element_size;
+static inline void image_desc_init(cl_image_desc* desc,
+                                   cl_mem_object_type imageType)
+    desc->image_type = imageType;
+    desc->image_width = TEST_IMAGE_SIZE;
+    if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType
+        && CL_MEM_OBJECT_IMAGE1D != imageType)
+    {
+        desc->image_height = TEST_IMAGE_SIZE;
+    }
+    if (CL_MEM_OBJECT_IMAGE3D == imageType
+        || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType)
+    {
+        desc->image_depth = TEST_IMAGE_SIZE;
+    }
+    if (CL_MEM_OBJECT_IMAGE1D_ARRAY == imageType
+        || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType)
+    {
+        desc->image_array_size = TEST_IMAGE_SIZE;
+    }
\ No newline at end of file
diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp
new file mode 100644
index 0000000..2ce33a1
--- /dev/null
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp
@@ -0,0 +1,1013 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "../testBase.h"
+#include "../common.h"
+#include "test_cl_ext_image_buffer.hpp"
+static int get_image_requirement_alignment(
+    cl_device_id device, cl_context context, cl_mem_flags flags,
+    const cl_image_format* image_format, const cl_image_desc* image_desc,
+    size_t* row_pitch_alignment, size_t* slice_pitch_alignment,
+    size_t* base_address_alignment)
+    cl_platform_id platform = getPlatformFromDevice(device);
+    GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT);
+    cl_int err = CL_SUCCESS;
+    if (nullptr != row_pitch_alignment)
+    {
+        err = clGetImageRequirementsInfoEXT(
+            context, nullptr, flags, image_format, image_desc,
+            sizeof(*row_pitch_alignment), row_pitch_alignment, nullptr);
+        test_error(err, "Error getting alignment");
+    }
+    if (nullptr != slice_pitch_alignment && CL_SUCCESS == err)
+    {
+        err = clGetImageRequirementsInfoEXT(
+            context, nullptr, flags, image_format, image_desc,
+            sizeof(*slice_pitch_alignment), slice_pitch_alignment, nullptr);
+        test_error(err, "Error getting alignment");
+    }
+    if (nullptr != base_address_alignment && CL_SUCCESS == err)
+    {
+        err = clGetImageRequirementsInfoEXT(
+            context, nullptr, flags, image_format, image_desc,
+            sizeof(*base_address_alignment), base_address_alignment, nullptr);
+        test_error(err, "Error getting alignment");
+    }
+    return TEST_PASS;
+ * Consistency with cl_khr_image2d_from_buffer: check that the row pitch and
+ * base address alignments returned here do not exceed the device's pixel-based
+ * CL_DEVICE_IMAGE_PITCH_ALIGNMENT / CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT.
+ */
+int image2d_from_buffer_positive(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+    if (!is_extension_available(device, "cl_khr_image2d_from_buffer"))
+    {
+        printf("Extension cl_khr_image2d_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            cl_uint row_pitch_alignment_2d = 0;
+            cl_int err =
+                clGetDeviceInfo(device, CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
+                                sizeof(row_pitch_alignment_2d),
+                                &row_pitch_alignment_2d, nullptr);
+            test_error(err, "Error clGetDeviceInfo");
+            cl_uint base_address_alignment_2d = 0;
+            err =
+                clGetDeviceInfo(device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT,
+                                sizeof(base_address_alignment_2d),
+                                &base_address_alignment_2d, nullptr);
+            test_error(err, "Error clGetDeviceInfo");
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                size_t row_pitch_alignment = 0;
+                size_t base_address_alignment = 0;
+                int get_error = get_image_requirement_alignment(
+                    device, context, 0, &format, &image_desc,
+                    &row_pitch_alignment, nullptr, &base_address_alignment);
+                if (TEST_PASS != get_error)
+                {
+                    return get_error;
+                }
+                const size_t element_size =
+                    get_format_size(context, &format, imageType, flag);
+                /* Alignments in pixels vs bytes */
+                if (base_address_alignment
+                    > base_address_alignment_2d * element_size)
+                {
+                    test_fail("Unexpected base_address_alignment");
+                }
+                if (row_pitch_alignment > row_pitch_alignment_2d * element_size)
+                {
+                    test_fail("Unexpected row_pitch_alignment");
+                }
+            }
+        }
+    }
+    return TEST_PASS;
+ * Test clGetMemObjectInfo
+ * Check that CL_MEM_ASSOCIATED_MEMOBJECT correctly returns the buffer that was
+ * used.
+ */
+int memInfo_image_from_buffer_positive(cl_device_id device, cl_context context,
+                                       cl_command_queue queue)
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                size_t row_pitch_alignment = 0;
+                size_t slice_pitch_alignment = 0;
+                int get_error = get_image_requirement_alignment(
+                    device, context, 0, &format, &image_desc,
+                    &row_pitch_alignment, &slice_pitch_alignment, nullptr);
+                if (TEST_PASS != get_error)
+                {
+                    return get_error;
+                }
+                const size_t element_size =
+                    get_format_size(context, &format, imageType, flag);
+                const size_t row_pitch = aligned_size(
+                    TEST_IMAGE_SIZE * element_size, row_pitch_alignment);
+                const size_t slice_pitch = aligned_size(
+                    row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment);
+                const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE;
+                cl_int err = CL_SUCCESS;
+                cl_mem buffer =
+                    clCreateBuffer(context, flag, buffer_size, nullptr, &err);
+                test_error(err, "Unable to create buffer");
+                image_desc.buffer = buffer;
+                cl_mem image_buffer = clCreateImage(context, flag, &format,
+                                                    &image_desc, nullptr, &err);
+                test_error(err, "Unable to create image");
+                cl_mem returned_buffer;
+                err = clGetMemObjectInfo(
+                    image_buffer, CL_MEM_ASSOCIATED_MEMOBJECT,
+                    sizeof(returned_buffer), &returned_buffer, nullptr);
+                test_error(err, "Error clGetMemObjectInfo");
+                if (returned_buffer != buffer)
+                {
+                    test_fail("Unexpected CL_MEM_ASSOCIATED_MEMOBJECT buffer");
+                }
+                err = clReleaseMemObject(buffer);
+                test_error(err, "Unable to release buffer");
+                err = clReleaseMemObject(image_buffer);
+                test_error(err, "Unable to release image");
+            }
+        }
+    }
+    return TEST_PASS;
+ * Test clGetImageInfo
+ * Check that the returned values for CL_IMAGE_ROW_PITCH and
+ * CL_IMAGE_SLICE_PITCH are correct.
+ */
+int imageInfo_image_from_buffer_positive(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue)
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                size_t row_pitch_alignment = 0;
+                size_t slice_pitch_alignment = 0;
+                int get_error = get_image_requirement_alignment(
+                    device, context, 0, &format, &image_desc,
+                    &row_pitch_alignment, &slice_pitch_alignment, nullptr);
+                if (TEST_PASS != get_error)
+                {
+                    return get_error;
+                }
+                const size_t element_size =
+                    get_format_size(context, &format, imageType, flag);
+                const size_t row_pitch = aligned_size(
+                    TEST_IMAGE_SIZE * element_size, row_pitch_alignment);
+                const size_t slice_pitch = aligned_size(
+                    row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment);
+                const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE;
+                cl_int err = CL_SUCCESS;
+                cl_mem buffer =
+                    clCreateBuffer(context, flag, buffer_size, nullptr, &err);
+                test_error(err, "Unable to create buffer");
+                image_desc.buffer = buffer;
+                if (imageType == CL_MEM_OBJECT_IMAGE2D
+                    || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+                {
+                    image_desc.image_row_pitch = row_pitch;
+                }
+                else if (imageType == CL_MEM_OBJECT_IMAGE3D
+                         || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                {
+                    image_desc.image_row_pitch = row_pitch;
+                    image_desc.image_slice_pitch = slice_pitch;
+                }
+                cl_mem image_buffer = clCreateImage(context, flag, &format,
+                                                    &image_desc, nullptr, &err);
+                test_error(err, "Unable to create image");
+                if (imageType == CL_MEM_OBJECT_IMAGE3D
+                    || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY
+                    || imageType == CL_MEM_OBJECT_IMAGE2D
+                    || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+                {
+                    size_t returned_row_pitch = 0;
+                    err = clGetImageInfo(image_buffer, CL_IMAGE_ROW_PITCH,
+                                         sizeof(returned_row_pitch),
+                                         &returned_row_pitch, nullptr);
+                    test_error(err, "Error clGetImageInfo");
+                    if (returned_row_pitch != row_pitch)
+                    {
+                        test_fail(
+                            "Unexpected row pitch "
+                            "CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT");
+                    }
+                }
+                if (imageType == CL_MEM_OBJECT_IMAGE3D
+                    || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                {
+                    size_t returned_slice_pitch = 0;
+                    err = clGetImageInfo(image_buffer, CL_IMAGE_SLICE_PITCH,
+                                         sizeof(returned_slice_pitch),
+                                         &returned_slice_pitch, nullptr);
+                    test_error(err, "Error clGetImageInfo");
+                    if (returned_slice_pitch != slice_pitch)
+                    {
+                        test_fail(
+                            "Unexpected row pitch "
+                    }
+                }
+                err = clReleaseMemObject(buffer);
+                test_error(err, "Unable to release buffer");
+                err = clReleaseMemObject(image_buffer);
+                test_error(err, "Unable to release image");
+            }
+        }
+    }
+    return TEST_PASS;
+ * Negative testing for clCreateImage and wrong alignment
+ * - Create an image from a buffer with invalid row pitch (not a multiple of
+ * required alignment); expect CL_INVALID_IMAGE_FORMAT_DESCRIPTOR.
+ * - Create an image from a buffer with invalid slice pitch (not a multiple of
+ * required alignment); expect CL_INVALID_IMAGE_FORMAT_DESCRIPTOR.
+ * - Create an image from a buffer with invalid base address alignment (not a
+ * multiple of required alignment) and check that CL_INVALID_IMAGE_DESCRIPTOR is
+ * returned
+ */
+int image_from_buffer_alignment_negative(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue)
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                size_t row_pitch_alignment = 0;
+                size_t slice_pitch_alignment = 0;
+                size_t base_address_alignment = 0;
+                int get_error = get_image_requirement_alignment(
+                    device, context, 0, &format, &image_desc,
+                    &row_pitch_alignment, &slice_pitch_alignment,
+                    &base_address_alignment);
+                if (TEST_PASS != get_error)
+                {
+                    return get_error;
+                }
+                const size_t element_size =
+                    get_format_size(context, &format, imageType, flag);
+                const size_t row_pitch = aligned_size(
+                    TEST_IMAGE_SIZE * element_size, row_pitch_alignment);
+                const size_t slice_pitch = aligned_size(
+                    row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment);
+                const size_t buffer_size = (slice_pitch + 1)
+                    * TEST_IMAGE_SIZE; /* For bigger row/slice pitch */
+                cl_int err = CL_SUCCESS;
+                cl_mem buffer =
+                    clCreateBuffer(context, flag, buffer_size, nullptr, &err);
+                test_error(err, "Unable to create buffer");
+                /* Test Row pitch images */
+                if (imageType == CL_MEM_OBJECT_IMAGE2D
+                    || imageType == CL_MEM_OBJECT_IMAGE3D
+                    || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY
+                    || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                {
+                    image_desc.buffer = buffer;
+                    image_desc.image_row_pitch =
+                        row_pitch + 1; /* wrong row pitch */
+                    clCreateImage(context, flag, &format, &image_desc, nullptr,
+                                  &err);
+                    test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+                                       "Unexpected clCreateImage return");
+                }
+                /* Test Slice pitch images */
+                if (imageType == CL_MEM_OBJECT_IMAGE3D
+                    || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                {
+                    image_desc.buffer = buffer;
+                    image_desc.image_row_pitch = row_pitch;
+                    image_desc.image_slice_pitch =
+                        slice_pitch + 1; /* wrong slice pitch */
+                    clCreateImage(context, flag, &format, &image_desc, nullptr,
+                                  &err);
+                    test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+                                       "Unexpected clCreateImage return");
+                }
+                /* Test buffer from host ptr to test base address alignment */
+                const size_t aligned_buffer_size =
+                    aligned_size(buffer_size, base_address_alignment);
+                /* Create buffer with host ptr and additional size for the wrong
+                 * alignment */
+                void* const host_ptr =
+                    malloc(aligned_buffer_size + base_address_alignment);
+                void* non_aligned_host_ptr =
+                    (void*)((char*)(aligned_ptr(host_ptr,
+                                                base_address_alignment))
+                            + 1); /* wrong alignment */
+                cl_mem buffer_host = clCreateBuffer(
+                    context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
+                    buffer_size, non_aligned_host_ptr, &err);
+                test_error(err, "Unable to create buffer");
+                image_desc.buffer = buffer_host;
+                clCreateImage(context, flag, &format, &image_desc, nullptr,
+                              &err);
+                test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+                                   "Unexpected clCreateImage return");
+                free(host_ptr);
+                err = clReleaseMemObject(buffer);
+                test_error(err, "Unable to release buffer");
+                err = clReleaseMemObject(buffer_host);
+                test_error(err, "Unable to release buffer");
+            }
+        }
+    }
+    return TEST_PASS;
+/**
+ * Negative testing for clCreateImage (buffer size).
+ * Create a buffer that is too small and check that image creation from that
+ * buffer is rejected.
+ */
+int image_from_small_buffer_negative(cl_device_id device, cl_context context,
+                                     cl_command_queue queue)
+    /* Both extensions are required; report a skip when either is absent. */
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    /* NOTE(review): the initializer list is empty in this text, so the
+     * imageType loop below never executes - confirm the intended types. */
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    /* flag is a by-value copy of the current flagTypes element. */
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                /* Memory objects are created with CL_MEM_READ_WRITE when the
+                 * format query used CL_MEM_KERNEL_READ_AND_WRITE.
+                 * NOTE(review): this overwrites the loop copy, so the format
+                 * queries for the remaining image types of this outer
+                 * iteration also use CL_MEM_READ_WRITE - confirm intended. */
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                /* Invalid buffer size: only TEST_IMAGE_SIZE / 2 is requested
+                 * from clCreateBuffer */
+                cl_int err;
+                cl_mem buffer = clCreateBuffer(
+                    context, flag, TEST_IMAGE_SIZE / 2, nullptr, &err);
+                test_error(err, "Unable to create buffer");
+                image_desc.buffer = buffer;
+                /* The returned image handle is discarded; only err is
+                 * compared against CL_INVALID_MEM_OBJECT. */
+                clCreateImage(context, flag, &format, &image_desc, nullptr,
+                              &err);
+                test_failure_error(err, CL_INVALID_MEM_OBJECT,
+                                   "Unexpected clCreateImage return");
+                err = clReleaseMemObject(buffer);
+                test_error(err, "Unable to release buffer");
+            }
+        }
+    }
+    return TEST_PASS;
+/* Read `region` of `image` back to the host (tightly packed) and verify that
+ * every byte equals `pattern`.
+ *
+ * queue        - command queue used for the blocking read
+ * image        - image to read back
+ * region       - width/height/depth (in elements) to read, starting at origin
+ * element_size - size in bytes of one image element
+ * pattern      - byte value every read-back byte must match
+ *
+ * Returns TEST_PASS when all bytes match; the test_error / test_fail macros
+ * return a failure code otherwise. */
+static int image_from_buffer_fill_check(cl_command_queue queue, cl_mem image,
+                                        size_t* region, size_t element_size,
+                                        char pattern)
+{
+    /* read the image from buffer and check the pattern */
+    const size_t image_size = region[0] * region[1] * region[2] * element_size;
+    size_t origin[3] = { 0, 0, 0 };
+    std::vector<char> read_buffer(image_size);
+    /* Row and slice pitch are passed as 0, so the host copy is tightly
+     * packed and occupies exactly image_size bytes. */
+    cl_int error =
+        clEnqueueReadImage(queue, image, CL_BLOCKING, origin, region, 0, 0,
+                           read_buffer.data(), 0, nullptr, nullptr);
+    test_error(error, "Error clEnqueueReadImage");
+    /* Every byte is expected to carry the pattern, so a flat scan covers the
+     * whole region. The previous index (line * row * depth * elmt) was the
+     * product of the loop counters and therefore stayed at 0 whenever any
+     * counter was 0, leaving most of the image unchecked. */
+    for (size_t index = 0; index < image_size; index++)
+    {
+        if (read_buffer[index] != pattern)
+        {
+            test_fail("Image pattern check failed");
+        }
+    }
+    return TEST_PASS;
+}
+/**
+ * Use fill buffer to fill the image from buffer
+ */
+int image_from_buffer_fill_positive(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
+    /* Both extensions are required; report a skip when either is absent. */
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    /* NOTE(review): the initializer list is empty in this text, so the
+     * imageType loop below never executes - confirm the intended types. */
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    /* flag is a by-value copy of the current flagTypes element. */
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                /* Memory objects are created with CL_MEM_READ_WRITE when the
+                 * format query used CL_MEM_KERNEL_READ_AND_WRITE.
+                 * NOTE(review): this overwrites the loop copy, so the format
+                 * queries for the remaining image types of this outer
+                 * iteration also use CL_MEM_READ_WRITE - confirm intended. */
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                /* Query row/slice pitch alignment; 0 is passed as the flags
+                 * argument and the base address alignment is not requested
+                 * (nullptr). */
+                size_t row_pitch_alignment = 0;
+                size_t slice_pitch_alignment = 0;
+                int get_error = get_image_requirement_alignment(
+                    device, context, 0, &format, &image_desc,
+                    &row_pitch_alignment, &slice_pitch_alignment, nullptr);
+                if (TEST_PASS != get_error)
+                {
+                    return get_error;
+                }
+                /* Size the buffer for TEST_IMAGE_SIZE slices of
+                 * TEST_IMAGE_SIZE rows, using the aligned pitches. */
+                const size_t element_size =
+                    get_format_size(context, &format, imageType, flag);
+                const size_t row_pitch = aligned_size(
+                    TEST_IMAGE_SIZE * element_size, row_pitch_alignment);
+                const size_t slice_pitch = aligned_size(
+                    row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment);
+                const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE;
+                cl_int err = CL_SUCCESS;
+                cl_mem buffer =
+                    clCreateBuffer(context, flag, buffer_size, nullptr, &err);
+                test_error(err, "Unable to create buffer");
+                /* fill the buffer with a pattern */
+                const char pattern = 0x55;
+                err = clEnqueueFillBuffer(queue, buffer, &pattern,
+                                          sizeof(pattern), 0, buffer_size, 0,
+                                          nullptr, nullptr);
+                test_error(err, "Error clEnqueueFillBuffer");
+                err = clFinish(queue);
+                test_error(err, "Error clFinish");
+                /* Destination image for the copy below. Only a 1D buffer
+                 * image gets its own backing buffer here; image1d_buffer is
+                 * assigned and released only in that case. */
+                cl_mem image1d_buffer;
+                if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+                {
+                    image1d_buffer = clCreateBuffer(context, flag, buffer_size,
+                                                    nullptr, &err);
+                    test_error(err, "Unable to create buffer");
+                    image_desc.buffer = image1d_buffer;
+                }
+                cl_mem image = clCreateImage(context, flag, &format,
+                                             &image_desc, nullptr, &err);
+                test_error(err, "Unable to create image");
+                /* Create a second image backed by the pattern-filled buffer,
+                 * with explicit pitches where the image type uses them */
+                image_desc.buffer = buffer;
+                if (imageType == CL_MEM_OBJECT_IMAGE2D
+                    || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+                {
+                    image_desc.image_row_pitch = row_pitch;
+                }
+                else if (imageType == CL_MEM_OBJECT_IMAGE3D
+                         || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                {
+                    image_desc.image_row_pitch = row_pitch;
+                    image_desc.image_slice_pitch = slice_pitch;
+                }
+                cl_mem image_from_buffer = clCreateImage(
+                    context, flag, &format, &image_desc, nullptr, &err);
+                test_error(err, "Unable to create image");
+                /* Region: the first dimension is always TEST_IMAGE_SIZE; the
+                 * second is used for everything except 1D and 1D buffer
+                 * images; the third only for 3D and 2D array images. */
+                size_t origin[3] = { 0, 0, 0 };
+                size_t region[3] = { 1, 1, 1 };
+                region[0] = TEST_IMAGE_SIZE;
+                if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType
+                    && CL_MEM_OBJECT_IMAGE1D != imageType)
+                {
+                    region[1] = TEST_IMAGE_SIZE;
+                }
+                if (CL_MEM_OBJECT_IMAGE3D == imageType
+                    || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType)
+                {
+                    region[2] = TEST_IMAGE_SIZE;
+                }
+                /* Check the copy of the image from buffer */
+                err =
+                    clEnqueueCopyImage(queue, image_from_buffer, image, origin,
+                                       origin, region, 0, nullptr, nullptr);
+                test_error(err, "Error clEnqueueCopyImage");
+                err = clFinish(queue);
+                test_error(err, "Error clFinish");
+                /* Both the source (image from buffer) and the copy must hold
+                 * the pattern.
+                 * NOTE(review): the early returns below skip the release
+                 * calls that follow, leaving the objects unreleased. */
+                int fill_error = image_from_buffer_fill_check(
+                    queue, image_from_buffer, region, element_size, pattern);
+                if (TEST_PASS != fill_error)
+                {
+                    return fill_error;
+                }
+                fill_error = image_from_buffer_fill_check(
+                    queue, image, region, element_size, pattern);
+                if (TEST_PASS != fill_error)
+                {
+                    return fill_error;
+                }
+                err = clReleaseMemObject(buffer);
+                test_error(err, "Unable to release buffer");
+                err = clReleaseMemObject(image);
+                test_error(err, "Unable to release image");
+                err = clReleaseMemObject(image_from_buffer);
+                test_error(err, "Unable to release image");
+                if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+                {
+                    err = clReleaseMemObject(image1d_buffer);
+                    test_error(err, "Unable to release image");
+                }
+            }
+        }
+    }
+    return TEST_PASS;
+/* Read `buffer` back to the host and verify that the bytes covered by the
+ * image region all equal `pattern`; padding between rows/slices is ignored.
+ *
+ * queue        - command queue used for the blocking read
+ * buffer       - buffer backing the image
+ * buffer_size  - number of bytes to read from the buffer
+ * region       - width/height/depth (in elements) of the image region
+ * element_size - size in bytes of one image element
+ * pattern      - byte value every checked byte must match
+ * row_pitch    - distance in bytes between consecutive rows
+ * slice_pitch  - distance in bytes between consecutive slices
+ *
+ * Returns TEST_PASS when all checked bytes match; the test_error / test_fail
+ * macros return a failure code otherwise. */
+static int image_from_buffer_read_check(cl_command_queue queue, cl_mem buffer,
+                                        const size_t buffer_size,
+                                        size_t* region, size_t element_size,
+                                        char pattern, size_t row_pitch,
+                                        size_t slice_pitch)
+{
+    /* read the buffer and check the pattern */
+    std::vector<char> host_buffer(buffer_size);
+    cl_int error =
+        clEnqueueReadBuffer(queue, buffer, CL_BLOCKING, 0, buffer_size,
+                            host_buffer.data(), 0, nullptr, nullptr);
+    test_error(error, "Error clEnqueueReadBuffer");
+    /* Only the first region[0] * element_size bytes of each row hold image
+     * data; the rest of a pitch is padding and is not compared. */
+    const size_t row_bytes = region[0] * element_size;
+    for (size_t k = 0; k < region[2]; k++)
+    {
+        for (size_t i = 0; i < region[1]; i++)
+        {
+            /* Same addressing as before: slice k starts at k * slice_pitch
+             * and row i starts i * row_pitch bytes into that slice. */
+            const char* row_ptr =
+                host_buffer.data() + k * slice_pitch + i * row_pitch;
+            for (size_t j = 0; j < row_bytes; j++)
+            {
+                if (row_ptr[j] != pattern)
+                {
+                    test_fail("Image pattern check failed");
+                }
+            }
+        }
+    }
+    return TEST_PASS;
+}
+/**
+ * Use fill image to fill the buffer that was used to create the image
+ */
+int image_from_buffer_read_positive(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
+    /* Both extensions are required; report a skip when either is absent. */
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    /* NOTE(review): the initializer list is empty in this text, so the loop
+     * below never executes - confirm the intended image types. */
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    for (auto imageType : imageTypes)
+    {
+        cl_image_desc image_desc = { 0 };
+        image_desc_init(&image_desc, imageType);
+        /* Non normalized format so we can read it back directly from
+         * clEnqueueFillImage */
+        cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT8 };
+        const char pattern = 0x55;
+        const size_t element_size =
+            get_format_size(context, &format, imageType, CL_MEM_READ_WRITE);
+        /* Query row/slice pitch alignment for this format and descriptor;
+         * the base address alignment is not requested (nullptr). */
+        size_t row_pitch_alignment = 0;
+        size_t slice_pitch_alignment = 0;
+        int get_error = get_image_requirement_alignment(
+            device, context, CL_MEM_READ_WRITE, &format, &image_desc,
+            &row_pitch_alignment, &slice_pitch_alignment, nullptr);
+        if (TEST_PASS != get_error)
+        {
+            return get_error;
+        }
+        /* Size the buffer for TEST_IMAGE_SIZE slices of TEST_IMAGE_SIZE rows,
+         * using the aligned pitches. */
+        const size_t row_pitch =
+            aligned_size(TEST_IMAGE_SIZE * element_size, row_pitch_alignment);
+        const size_t slice_pitch =
+            aligned_size(row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment);
+        const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE;
+        cl_int err = CL_SUCCESS;
+        cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size,
+                                       nullptr, &err);
+        test_error(err, "Unable to create buffer");
+        /* Create the image on top of the buffer, with explicit pitches where
+         * the image type uses them */
+        image_desc.buffer = buffer;
+        if (imageType == CL_MEM_OBJECT_IMAGE2D
+            || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        {
+            image_desc.image_row_pitch = row_pitch;
+        }
+        else if (imageType == CL_MEM_OBJECT_IMAGE3D
+                 || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+        {
+            image_desc.image_row_pitch = row_pitch;
+            image_desc.image_slice_pitch = slice_pitch;
+        }
+        cl_mem image = clCreateImage(context, CL_MEM_READ_WRITE, &format,
+                                     &image_desc, nullptr, &err);
+        test_error(err, "Unable to create image");
+        /* Region: the first dimension is always TEST_IMAGE_SIZE; the second
+         * is used for everything except 1D and 1D buffer images; the third
+         * only for 3D and 2D array images. */
+        size_t origin[3] = { 0, 0, 0 };
+        size_t region[3] = { 1, 1, 1 };
+        region[0] = TEST_IMAGE_SIZE;
+        if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType
+            && CL_MEM_OBJECT_IMAGE1D != imageType)
+        {
+            region[1] = TEST_IMAGE_SIZE;
+        }
+        if (CL_MEM_OBJECT_IMAGE3D == imageType
+            || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType)
+        {
+            region[2] = TEST_IMAGE_SIZE;
+        }
+        /* fill the image with a pattern */
+        cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+        err = clEnqueueFillImage(queue, image, fill_color, origin, region, 0,
+                                 nullptr, nullptr);
+        test_error(err, "Error clEnqueueFillImage");
+        err = clFinish(queue);
+        test_error(err, "Error clFinish");
+        /* Verify the fill through the backing buffer. For 1D arrays the
+         * checker is given slice_pitch as its row stride - presumably because
+         * each row of the region is an array slice there; confirm.
+         * NOTE(review): the early return below skips the release calls. */
+        int read_error = image_from_buffer_read_check(
+            queue, buffer, buffer_size, region, element_size, pattern,
+            (imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? slice_pitch
+                                                       : row_pitch,
+            slice_pitch);
+        if (TEST_PASS != read_error)
+        {
+            return read_error;
+        }
+        err = clReleaseMemObject(buffer);
+        test_error(err, "Unable to release buffer");
+        err = clReleaseMemObject(image);
+        test_error(err, "Unable to release image");
+    }
+    return TEST_PASS;
\ No newline at end of file
diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_requirements_info.cpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_requirements_info.cpp
new file mode 100644
index 0000000..9212fcb
--- /dev/null
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_requirements_info.cpp
@@ -0,0 +1,482 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "../testBase.h"
+#include "../common.h"
+#include "test_cl_ext_image_buffer.hpp"
+/**
+ * Negative tests for {CL_IMAGE_REQUIREMENTS_SIZE_EXT}
+ * Check that attempting to perform the {CL_IMAGE_REQUIREMENTS_SIZE_EXT} query
+ * without specifying the _image_format_ results in {CL_INVALID_VALUE} being
+ * returned. Check that attempting to perform the
+ * {CL_IMAGE_REQUIREMENTS_SIZE_EXT} query without specifying the _image_desc_
+ * results in {CL_INVALID_VALUE} being returned.
+ */
+int cl_image_requirements_size_ext_negative(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue)
+    /* Only the requirements-info extension is needed; skip without it. */
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    /* GET_EXTENSION_FUNC presumably resolves the extension entry point for
+     * this platform so it can be called by name below - confirm. */
+    cl_platform_id platform = getPlatformFromDevice(device);
+    GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT);
+    size_t max_size = 0;
+    size_t param_val_size = 0;
+    /* Valid 2D descriptor and format; each call below replaces exactly one
+     * of them with nullptr. */
+    cl_image_desc image_desc = { 0 };
+    image_desc_init(&image_desc, CL_MEM_OBJECT_IMAGE2D);
+    cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT16 };
+    /* Check image_format null results in CL_INVALID_VALUE */
+    cl_int err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc,
+        CL_IMAGE_REQUIREMENTS_SIZE_EXT, sizeof(max_size), &max_size,
+        &param_val_size);
+    test_failure_error(err, CL_INVALID_VALUE,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check image_desc null results in CL_INVALID_VALUE */
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &format, nullptr,
+        CL_IMAGE_REQUIREMENTS_SIZE_EXT, sizeof(max_size), &max_size,
+        &param_val_size);
+    test_failure_error(err, CL_INVALID_VALUE,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    return TEST_PASS;
+/**
+ * Consistency checks for CL_IMAGE_REQUIREMENTS_SIZE_EXT
+ * When creating 2D images from a buffer is supported:
+ * Check that the CL_IMAGE_REQUIREMENTS_SIZE_EXT query can be performed
+ * successfully. Create a buffer with the size returned and check that an image
+ * can successfully be created from the buffer. Check that the value returned
+ * for CL_MEM_SIZE for the image is the same as the value returned for
+ * CL_IMAGE_REQUIREMENTS_SIZE_EXT.
+ */
+int cl_image_requirements_size_ext_consistency(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue)
+    /* Both extensions are required; report a skip when either is absent. */
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    if (!is_extension_available(device, "cl_ext_image_from_buffer"))
+    {
+        printf("Extension cl_ext_image_from_buffer not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    /* GET_EXTENSION_FUNC presumably resolves the extension entry point for
+     * this platform so it can be called by name below - confirm. */
+    cl_platform_id platform = getPlatformFromDevice(device);
+    GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT);
+    size_t max_size = 0;
+    size_t param_val_size = 0;
+    /* NOTE(review): the initializer list is empty in this text, so the
+     * imageType loop below never executes - confirm the intended types. */
+    std::vector<cl_mem_object_type> imageTypes{
+    };
+    std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY,
+                                         CL_MEM_READ_WRITE,
+                                         CL_MEM_KERNEL_READ_AND_WRITE };
+    /* flag is a by-value copy of the current flagTypes element. */
+    for (auto flag : flagTypes)
+    {
+        for (auto imageType : imageTypes)
+        {
+            /* Get the list of supported image formats */
+            std::vector<cl_image_format> formatList;
+            if (TEST_PASS
+                    != get_format_list(context, imageType, formatList, flag)
+                || formatList.size() == 0)
+            {
+                test_fail("Failure to get supported formats list");
+            }
+            for (auto format : formatList)
+            {
+                cl_image_desc image_desc = { 0 };
+                image_desc_init(&image_desc, imageType);
+                /* Memory objects are created with CL_MEM_READ_WRITE when the
+                 * format query used CL_MEM_KERNEL_READ_AND_WRITE.
+                 * NOTE(review): this overwrites the loop copy, so the format
+                 * queries for the remaining image types of this outer
+                 * iteration also use CL_MEM_READ_WRITE - confirm intended. */
+                flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE)
+                    ? CL_MEM_READ_WRITE
+                    : flag;
+                /* Ask how many bytes an image with this format/descriptor
+                 * requires. */
+                cl_int err = clGetImageRequirementsInfoEXT(
+                    context, nullptr, flag, &format, &image_desc,
+                    CL_IMAGE_REQUIREMENTS_SIZE_EXT, sizeof(max_size), &max_size,
+                    &param_val_size);
+                test_error(err, "Error clGetImageRequirementsInfoEXT");
+                /* Create a buffer of exactly the reported size */
+                cl_mem buffer =
+                    clCreateBuffer(context, flag, max_size, nullptr, &err);
+                test_error(err, "Unable to create buffer");
+                image_desc.buffer = buffer;
+                /* Image of the current imageType created from the buffer */
+                cl_mem image_buffer = clCreateImage(context, flag, &format,
+                                                    &image_desc, nullptr, &err);
+                test_error(err, "Unable to create image");
+                /* The image's CL_MEM_SIZE must equal the size reported by
+                 * the requirements query. */
+                size_t size = 0;
+                err = clGetMemObjectInfo(image_buffer, CL_MEM_SIZE,
+                                         sizeof(size_t), &size, NULL);
+                test_error(err, "Error clGetMemObjectInfo");
+                if (max_size != size)
+                {
+                    test_fail("CL_IMAGE_REQUIREMENTS_SIZE_EXT different from "
+                              "CL_MEM_SIZE");
+                }
+                err = clReleaseMemObject(image_buffer);
+                test_error(err, "Error clReleaseMemObject");
+                err = clReleaseMemObject(buffer);
+                test_error(err, "Error clReleaseMemObject");
+            }
+        }
+    }
+    return TEST_PASS;
+/**
+ * Negative testing for all testable error codes returned by
+ * clGetImageRequirementsInfoEXT
+ */
+int clGetImageRequirementsInfoEXT_negative(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    /* Only the requirements-info extension is needed; skip without it. */
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    /* GET_EXTENSION_FUNC presumably resolves the extension entry point for
+     * this platform so it can be called by name below - confirm. */
+    cl_platform_id platform = getPlatformFromDevice(device);
+    GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT);
+    /* Valid 3D descriptor and format, used by the calls below except where a
+     * deliberately invalid one is substituted. */
+    cl_image_desc image_desc = { 0 };
+    image_desc_init(&image_desc, CL_MEM_OBJECT_IMAGE3D);
+    cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT16 };
+    /* NOTE(review): the calls below that write to row_pitch_alignment pass
+     * one argument fewer than the invalid_info and max_height calls further
+     * down, except the invalid_info one; their param_name argument appears
+     * to be missing from this text - confirm against the original file. */
+    /* Check that CL_INVALID_CONTEXT is returned when passing nullptr as context
+     */
+    size_t row_pitch_alignment = 0;
+    cl_int err = clGetImageRequirementsInfoEXT(
+        nullptr, nullptr, CL_MEM_READ_WRITE, &format, &image_desc,
+        sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_CONTEXT,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_IMAGE_DESCRIPTOR is returned when passing an
+     * invalid image_type (CL_MEM_OBJECT_BUFFER) */
+    cl_image_desc invalid_desc = { CL_MEM_OBJECT_BUFFER, TEST_IMAGE_SIZE };
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &format, &invalid_desc,
+        sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_VALUE is returned when passing invalid flags */
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, -1, &format, &image_desc,
+        sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_VALUE,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_IMAGE_FORMAT_DESCRIPTOR is returned when passing
+     * an invalid image_format (CL_INTENSITY with CL_UNORM_SHORT_555) */
+    cl_image_format invalid_format = { CL_INTENSITY, CL_UNORM_SHORT_555 };
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &invalid_format, &image_desc,
+        sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_IMAGE_DESCRIPTOR is returned when passing an
+     * image_desc with invalid values (second initializer is 0) */
+    cl_image_desc invalid_desc_size = { CL_MEM_OBJECT_IMAGE1D, 0 };
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &format, &invalid_desc_size,
+        sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_VALUE is returned when passing an invalid
+     * param_name */
+    cl_image_requirements_info_ext invalid_info = CL_IMAGE_FORMAT;
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, invalid_info,
+        sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_VALUE,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_VALUE is returned when passing a param_value_size
+     * value smaller than the size of the return type (size_t case) */
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc,
+        sizeof(row_pitch_alignment) - 1, &row_pitch_alignment, nullptr);
+    test_failure_error(err, CL_INVALID_VALUE,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    /* Check that CL_INVALID_VALUE is returned when passing a param_value_size
+     * value smaller than the size of the return type (uint32_t case, using
+     * the CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT query) */
+    uint32_t max_height = 0;
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc,
+        CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, sizeof(max_height) - 1,
+        &max_height, nullptr);
+    test_failure_error(err, CL_INVALID_VALUE,
+                       "Unexpected clGetImageRequirementsInfoEXT return");
+    return TEST_PASS;
+/**
+ * Negative testing for {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT}
+ * Attempt to perform the {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT} query on all
+ * image types for which it is not valid. Check that
+ * {CL_INVALID_IMAGE_DESCRIPTOR} is returned in all cases.
+ *
+ * Negative testing for {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT}
+ * Attempt to perform the {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT} query on all
+ * image types for which it is not valid. Check that
+ * {CL_INVALID_IMAGE_DESCRIPTOR} is returned in all cases.
+ *
+ * Negative testing for {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT}
+ * Attempt to perform the {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT} query on
+ * all image types for which it is not valid. Check that
+ * {CL_INVALID_IMAGE_DESCRIPTOR} is returned in all cases.
+ */
+int cl_image_requirements_max_val_ext_negative(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue)
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    cl_platform_id platform = getPlatformFromDevice(device);
+    GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT);
+    size_t value = 0;
+    std::vector<cl_mem_object_type> imageTypes_height{
+    };
+    cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT16 };
+    for (auto imageType : imageTypes_height)
+    {
+        cl_image_desc image_desc = { 0 };
+        image_desc_init(&image_desc, imageType);
+        /* Check image_format null results in CL_INVALID_VALUE */
+        cl_int err = clGetImageRequirementsInfoEXT(
+            context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc,
+            CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, sizeof(value), &value,
+            nullptr);
+        test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR,
+                           "Unexpected clGetImageRequirementsInfoEXT return");
+    }
+    std::vector<cl_mem_object_type> imageTypes_depth{
+    };
+    for (auto imageType : imageTypes_depth)
+    {
+        cl_image_desc image_desc = { 0 };
+        image_desc_init(&image_desc, imageType);
+        /* Check image_format null results in CL_INVALID_VALUE */
+        cl_int err = clGetImageRequirementsInfoEXT(
+            context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc,
+            CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT, sizeof(value), &value,
+            nullptr);
+        test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR,
+                           "Unexpected clGetImageRequirementsInfoEXT return");
+    }
+    std::vector<cl_mem_object_type> imageTypes_array_size{
+    };
+    for (auto imageType : imageTypes_array_size)
+    {
+        cl_image_desc image_desc = { 0 };
+        image_desc_init(&image_desc, imageType);
+        /* Expect CL_INVALID_IMAGE_DESCRIPTOR: query invalid for this type */
+        cl_int err = clGetImageRequirementsInfoEXT(
+            context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc,
+            CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT, sizeof(value), &value,
+            nullptr);
+        test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR,
+                           "Unexpected clGetImageRequirementsInfoEXT return");
+    }
+    return TEST_PASS;
+ * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT}
+ ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT} query can be performed
+ * successfully
+ *
+ * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT}
+ ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT} query can be performed
+ * successfully
+ *
+ * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT}
+ ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT} query can be performed
+ * successfully
+ *
+ ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT} query can be
+ * performed successfully
+ */
+int cl_image_requirements_max_val_ext_positive(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue)
+    if (!is_extension_available(device, "cl_ext_image_requirements_info"))
+    {
+        printf("Extension cl_ext_image_requirements_info not available");
+        return TEST_SKIPPED_ITSELF;
+    }
+    cl_platform_id platform = getPlatformFromDevice(device);
+    GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT);
+    cl_image_desc image_desc_1d = { 0 };
+    image_desc_init(&image_desc_1d, CL_MEM_OBJECT_IMAGE1D);
+    uint32_t max_width = 0;
+    cl_int err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc_1d,
+        CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT, sizeof(max_width), &max_width,
+        nullptr);
+    test_error(err, "Error clGetImageRequirementsInfoEXT");
+    size_t width_1d = 0;
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+                          sizeof(width_1d), &width_1d, NULL);
+    test_error(err, "Error clGetDeviceInfo");
+    if (!(max_width <= width_1d && max_width > 0))
+    {
+        test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT value");
+    }
+    cl_image_desc image_desc_2d = { 0 };
+    image_desc_init(&image_desc_2d, CL_MEM_OBJECT_IMAGE2D);
+    uint32_t max_height = 0;
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc_2d,
+        CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, sizeof(max_height), &max_height,
+        nullptr);
+    test_error(err, "Error clGetImageRequirementsInfoEXT");
+    size_t height_2d = 0;
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                          sizeof(height_2d), &height_2d, NULL);
+    test_error(err, "Error clGetDeviceInfo");
+    if (!(max_height <= height_2d && max_height > 0))
+    {
+        test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT value");
+    }
+    cl_image_desc image_desc_3d = { 0 };
+    image_desc_init(&image_desc_3d, CL_MEM_OBJECT_IMAGE3D);
+    uint32_t max_depth = 0;
+    err = clGetImageRequirementsInfoEXT(context, nullptr, CL_MEM_READ_WRITE,
+                                        nullptr, &image_desc_3d,
+                                        CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT,
+                                        sizeof(max_depth), &max_depth, nullptr);
+    test_error(err, "Error clGetImageRequirementsInfoEXT");
+    size_t depth_3d = 0;
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(depth_3d),
+                          &depth_3d, NULL);
+    test_error(err, "Error clGetDeviceInfo");
+    if (!(max_depth <= depth_3d && max_depth > 0))
+    {
+        test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT value");
+    }
+    cl_image_desc image_desc_array = { 0 };
+    image_desc_init(&image_desc_array, CL_MEM_OBJECT_IMAGE2D_ARRAY);
+    uint32_t max_array_size = 0;
+    err = clGetImageRequirementsInfoEXT(
+        context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc_array,
+        CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT, sizeof(max_array_size),
+        &max_array_size, nullptr);
+    test_error(err, "Error clGetImageRequirementsInfoEXT");
+    size_t array_size = 0;
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE,
+                          sizeof(array_size), &array_size, NULL);
+    test_error(err, "Error clGetDeviceInfo");
+    if (!(max_array_size <= array_size && max_array_size > 0))
+    {
+        test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT value");
+    }
+    return TEST_PASS;
\ No newline at end of file
diff --git a/test_conformance/images/kernel_read_write/test_common.cpp b/test_conformance/images/kernel_read_write/test_common.cpp
index e76710b..a22db19 100644
--- a/test_conformance/images/kernel_read_write/test_common.cpp
+++ b/test_conformance/images/kernel_read_write/test_common.cpp
@@ -16,6 +16,7 @@
 #include "test_common.h"
+#include <algorithm>
 cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool test_mipmaps, cl_int *error) {
     cl_sampler sampler = nullptr;
@@ -33,122 +34,210 @@
     return sampler;
-void InitFloatCoordsCommon(image_descriptor *imageInfo,
-                           image_sampler_data *imageSampler, float *xOffsets,
-                           float *yOffsets, float *zOffsets, float xfract,
-                           float yfract, float zfract, int normalized_coords,
-                           MTdata d, int lod)
+bool get_image_dimensions(image_descriptor *imageInfo, size_t &width,
+                          size_t &height, size_t &depth)
+    width = imageInfo->width;
+    height = 1;
+    depth = 1;
+    switch (imageInfo->type)
+    {
+        case CL_MEM_OBJECT_IMAGE1D: break;
+        case CL_MEM_OBJECT_IMAGE1D_ARRAY: height = imageInfo->arraySize; break;
+        case CL_MEM_OBJECT_IMAGE2D: height = imageInfo->height; break;
+            height = imageInfo->height;
+            depth = imageInfo->arraySize;
+            break;
+        case CL_MEM_OBJECT_IMAGE3D:
+            height = imageInfo->height;
+            depth = imageInfo->depth;
+            break;
+        default:
+            log_error("ERROR: Test does not support image type");
+            return TEST_FAIL;
+    }
+    return 0;
+static bool InitFloatCoordsCommon(image_descriptor *imageInfo,
+                                  image_sampler_data *imageSampler,
+                                  float *xOffsets, float *yOffsets,
+                                  float *zOffsets, float xfract, float yfract,
+                                  float zfract, int normalized_coords, MTdata d,
+                                  int lod)
     size_t i = 0;
-    if (gDisableOffsets)
+    size_t width_loop, height_loop, depth_loop;
+    bool error =
+        get_image_dimensions(imageInfo, width_loop, height_loop, depth_loop);
+    if (!error)
-        for (size_t z = 0; z < imageInfo->depth; z++)
+        if (gDisableOffsets)
-            for (size_t y = 0; y < imageInfo->height; y++)
+            for (size_t z = 0; z < depth_loop; z++)
-                for (size_t x = 0; x < imageInfo->width; x++, i++)
+                for (size_t y = 0; y < height_loop; y++)
-                    xOffsets[i] = (float)(xfract + (double)x);
-                    yOffsets[i] = (float)(yfract + (double)y);
-                    zOffsets[i] = (float)(zfract + (double)z);
-                }
-            }
-        }
-    }
-    else
-    {
-        for (size_t z = 0; z < imageInfo->depth; z++)
-        {
-            for (size_t y = 0; y < imageInfo->height; y++)
-            {
-                for (size_t x = 0; x < imageInfo->width; x++, i++)
-                {
-                    xOffsets[i] =
-                        (float)(xfract
-                                + (double)((int)x
-                                           + random_in_range(-10, 10, d)));
-                    yOffsets[i] =
-                        (float)(yfract
-                                + (double)((int)y
-                                           + random_in_range(-10, 10, d)));
-                    zOffsets[i] =
-                        (float)(zfract
-                                + (double)((int)z
-                                           + random_in_range(-10, 10, d)));
-                }
-            }
-        }
-    }
-    if (imageSampler->addressing_mode == CL_ADDRESS_NONE)
-    {
-        i = 0;
-        for (size_t z = 0; z < imageInfo->depth; z++)
-        {
-            for (size_t y = 0; y < imageInfo->height; y++)
-            {
-                for (size_t x = 0; x < imageInfo->width; x++, i++)
-                {
-                    xOffsets[i] = (float)CLAMP((double)xOffsets[i], 0.0,
-                                               (double)imageInfo->width - 1.0);
-                    yOffsets[i] = (float)CLAMP((double)yOffsets[i], 0.0,
-                                               (double)imageInfo->height - 1.0);
-                    zOffsets[i] = (float)CLAMP((double)zOffsets[i], 0.0,
-                                               (double)imageInfo->depth - 1.0);
-                }
-            }
-        }
-    }
-    if (normalized_coords || gTestMipmaps)
-    {
-        i = 0;
-        if (lod == 0)
-        {
-            for (size_t z = 0; z < imageInfo->depth; z++)
-            {
-                for (size_t y = 0; y < imageInfo->height; y++)
-                {
-                    for (size_t x = 0; x < imageInfo->width; x++, i++)
+                    for (size_t x = 0; x < width_loop; x++, i++)
-                        xOffsets[i] = (float)((double)xOffsets[i]
-                                              / (double)imageInfo->width);
-                        yOffsets[i] = (float)((double)yOffsets[i]
-                                              / (double)imageInfo->height);
-                        zOffsets[i] = (float)((double)zOffsets[i]
-                                              / (double)imageInfo->depth);
+                        xOffsets[i] = (float)(xfract + (double)x);
+                        yOffsets[i] = (float)(yfract + (double)y);
+                        zOffsets[i] = (float)(zfract + (double)z);
-        else if (gTestMipmaps)
+        else
-            size_t width_lod, height_lod, depth_lod;
-            width_lod =
-                (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
-            height_lod =
-                (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
-            depth_lod =
-                (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
-            for (size_t z = 0; z < depth_lod; z++)
+            for (size_t z = 0; z < depth_loop; z++)
-                for (size_t y = 0; y < height_lod; y++)
+                for (size_t y = 0; y < height_loop; y++)
-                    for (size_t x = 0; x < width_lod; x++, i++)
+                    for (size_t x = 0; x < width_loop; x++, i++)
                         xOffsets[i] =
-                            (float)((double)xOffsets[i] / (double)width_lod);
+                            (float)(xfract
+                                    + (double)((int)x
+                                               + random_in_range(-10, 10, d)));
                         yOffsets[i] =
-                            (float)((double)yOffsets[i] / (double)height_lod);
+                            (float)(yfract
+                                    + (double)((int)y
+                                               + random_in_range(-10, 10, d)));
                         zOffsets[i] =
-                            (float)((double)zOffsets[i] / (double)depth_lod);
+                            (float)(zfract
+                                    + (double)((int)z
+                                               + random_in_range(-10, 10, d)));
+                    }
+                }
+            }
+        }
+        if (imageSampler->addressing_mode == CL_ADDRESS_NONE)
+        {
+            i = 0;
+            for (size_t z = 0; z < depth_loop; z++)
+            {
+                for (size_t y = 0; y < height_loop; y++)
+                {
+                    for (size_t x = 0; x < width_loop; x++, i++)
+                    {
+                        xOffsets[i] = (float)CLAMP((double)xOffsets[i], 0.0,
+                                                   (double)width_loop - 1.0);
+                        yOffsets[i] = (float)CLAMP((double)yOffsets[i], 0.0,
+                                                   (double)height_loop - 1.0);
+                        zOffsets[i] = (float)CLAMP((double)zOffsets[i], 0.0,
+                                                   (double)depth_loop - 1.0);
+                    }
+                }
+            }
+        }
+        if (normalized_coords || gTestMipmaps)
+        {
+            i = 0;
+            if (lod == 0)
+            {
+                for (size_t z = 0; z < depth_loop; z++)
+                {
+                    for (size_t y = 0; y < height_loop; y++)
+                    {
+                        for (size_t x = 0; x < width_loop; x++, i++)
+                        {
+                            xOffsets[i] = (float)((double)xOffsets[i]
+                                                  / (double)width_loop);
+                            if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
+                            {
+                                yOffsets[i] = (float)((double)yOffsets[i]
+                                                      / (double)height_loop);
+                            }
+                            if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                            {
+                                zOffsets[i] = (float)((double)zOffsets[i]
+                                                      / (double)depth_loop);
+                            }
+                        }
+                    }
+                }
+            }
+            else if (gTestMipmaps)
+            {
+                size_t width_lod =
+                    (width_loop >> lod) ? (width_loop >> lod) : 1;
+                size_t height_lod = height_loop;
+                size_t depth_lod = depth_loop;
+                if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
+                {
+                    height_lod =
+                        (height_loop >> lod) ? (height_loop >> lod) : 1;
+                }
+                if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                {
+                    depth_lod = (depth_loop >> lod) ? (depth_loop >> lod) : 1;
+                }
+                for (size_t z = 0; z < depth_lod; z++)
+                {
+                    for (size_t y = 0; y < height_lod; y++)
+                    {
+                        for (size_t x = 0; x < width_lod; x++, i++)
+                        {
+                            xOffsets[i] = (float)((double)xOffsets[i]
+                                                  / (double)width_lod);
+                            if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
+                            {
+                                yOffsets[i] = (float)((double)yOffsets[i]
+                                                      / (double)height_lod);
+                            }
+                            if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
+                            {
+                                zOffsets[i] = (float)((double)zOffsets[i]
+                                                      / (double)depth_lod);
+                            }
+                        }
+    return error;
+cl_mem create_image_of_type(cl_context context, cl_mem_flags mem_flags,
+                            image_descriptor *imageInfo, size_t row_pitch,
+                            size_t slice_pitch, void *host_ptr, cl_int *error)
+    cl_mem image;
+    switch (imageInfo->type)
+    {
+        case CL_MEM_OBJECT_IMAGE3D:
+            image = create_image_3d(context, mem_flags, imageInfo->format,
+                                    imageInfo->width, imageInfo->height,
+                                    imageInfo->depth, row_pitch, slice_pitch,
+                                    host_ptr, error);
+            break;
+        default:
+            log_error("Implementation is incomplete, only 3D images are "
+                      "supported so far");
+            return nullptr;
+    }
+    return image;
+static size_t get_image_num_pixels(image_descriptor *imageInfo, size_t width,
+                                   size_t height, size_t depth,
+                                   size_t array_size)
+    size_t image_size;
+    switch (imageInfo->type)
+    {
+        case CL_MEM_OBJECT_IMAGE3D: image_size = width * height * depth; break;
+        default:
+            log_error("Implementation is incomplete, only 3D images are "
+                      "supported so far");
+            return 0;
+    }
+    return image_size;
 int test_read_image(cl_context context, cl_command_queue queue,
@@ -160,6 +249,17 @@
     size_t threads[3];
     static int initHalf = 0;
+    size_t image_size =
+        get_image_num_pixels(imageInfo, imageInfo->width, imageInfo->height,
+                             imageInfo->depth, imageInfo->arraySize);
+    test_assert_error(0 != image_size, "Invalid image size");
+    size_t width_size, height_size, depth_size;
+    if (get_image_dimensions(imageInfo, width_size, height_size, depth_size))
+    {
+        log_error("ERROR: invalid image dimensions");
+        return CL_INVALID_VALUE;
+    }
     cl_mem_flags image_read_write_flags = CL_MEM_READ_ONLY;
     clMemWrapper xOffsets, yOffsets, zOffsets, results;
@@ -168,14 +268,11 @@
     // Create offset data
     BufferOwningPtr<cl_float> xOffsetValues(
-        malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height
-               * imageInfo->depth));
+        malloc(sizeof(cl_float) * image_size));
     BufferOwningPtr<cl_float> yOffsetValues(
-        malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height
-               * imageInfo->depth));
+        malloc(sizeof(cl_float) * image_size));
     BufferOwningPtr<cl_float> zOffsetValues(
-        malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height
-               * imageInfo->depth));
+        malloc(sizeof(cl_float) * image_size));
     if (imageInfo->format->image_channel_data_type == CL_HALF_FLOAT)
         if (DetectFloatToHalfRoundingMode(queue)) return 1;
@@ -206,26 +303,27 @@
                                        maxImageUseHostPtrBackingStore, d);
-            unprotImage = create_image_3d(
+            unprotImage = create_image_of_type(
                 context, image_read_write_flags | CL_MEM_USE_HOST_PTR,
-                imageInfo->format, imageInfo->width, imageInfo->height,
-                imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0),
+                imageInfo, (gEnablePitch ? imageInfo->rowPitch : 0),
                 (gEnablePitch ? imageInfo->slicePitch : 0),
                 maxImageUseHostPtrBackingStore, &error);
-            error = protImage.Create(context, image_read_write_flags,
-                                     imageInfo->format, imageInfo->width,
-                                     imageInfo->height, imageInfo->depth);
+            error = protImage.Create(context, imageInfo->type,
+                                     image_read_write_flags, imageInfo->format,
+                                     imageInfo->width, imageInfo->height,
+                                     imageInfo->depth, imageInfo->arraySize);
         if (error != CL_SUCCESS)
-            log_error("ERROR: Unable to create 3D image of size %d x %d x %d "
+            log_error("ERROR: Unable to create image of size %d x %d x %d x %d "
                       "(pitch %d, %d ) (%s)",
                       (int)imageInfo->width, (int)imageInfo->height,
-                      (int)imageInfo->depth, (int)imageInfo->rowPitch,
-                      (int)imageInfo->slicePitch, IGetErrorString(error));
+                      (int)imageInfo->depth, (int)imageInfo->arraySize,
+                      (int)imageInfo->rowPitch, (int)imageInfo->slicePitch,
+                      IGetErrorString(error));
             return error;
         if (gTestMaxImages)
@@ -237,18 +335,18 @@
         // Don't use clEnqueueWriteImage; just use copy host ptr to get the data
         // in
-        unprotImage = create_image_3d(
-            context, image_read_write_flags | CL_MEM_COPY_HOST_PTR,
-            imageInfo->format, imageInfo->width, imageInfo->height,
-            imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0),
+        unprotImage = create_image_of_type(
+            context, image_read_write_flags | CL_MEM_COPY_HOST_PTR, imageInfo,
+            (gEnablePitch ? imageInfo->rowPitch : 0),
             (gEnablePitch ? imageInfo->slicePitch : 0), imageValues, &error);
         if (error != CL_SUCCESS)
-            log_error("ERROR: Unable to create 3D image of size %d x %d x %d "
+            log_error("ERROR: Unable to create image of size %d x %d x %d x %d "
                       "(pitch %d, %d ) (%s)",
                       (int)imageInfo->width, (int)imageInfo->height,
-                      (int)imageInfo->depth, (int)imageInfo->rowPitch,
-                      (int)imageInfo->slicePitch, IGetErrorString(error));
+                      (int)imageInfo->depth, (int)imageInfo->arraySize,
+                      (int)imageInfo->rowPitch, (int)imageInfo->slicePitch,
+                      IGetErrorString(error));
             return error;
         image = unprotImage;
@@ -260,19 +358,19 @@
         // specified, so we just do the same thing either way
         if (!gTestMipmaps)
-            unprotImage = create_image_3d(
-                context, image_read_write_flags | gMemFlagsToUse,
-                imageInfo->format, imageInfo->width, imageInfo->height,
-                imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0),
+            unprotImage = create_image_of_type(
+                context, image_read_write_flags | gMemFlagsToUse, imageInfo,
+                (gEnablePitch ? imageInfo->rowPitch : 0),
                 (gEnablePitch ? imageInfo->slicePitch : 0), imageValues,
             if (error != CL_SUCCESS)
-                log_error("ERROR: Unable to create 3D image of size %d x %d x "
-                          "%d (pitch %d, %d ) (%s)",
+                log_error("ERROR: Unable to create image of size %d x %d x "
+                          "%d x %d (pitch %d, %d ) (%s)",
                           (int)imageInfo->width, (int)imageInfo->height,
-                          (int)imageInfo->depth, (int)imageInfo->rowPitch,
-                          (int)imageInfo->slicePitch, IGetErrorString(error));
+                          (int)imageInfo->depth, (int)imageInfo->arraySize,
+                          (int)imageInfo->rowPitch, (int)imageInfo->slicePitch,
+                          IGetErrorString(error));
                 return error;
             image = unprotImage;
@@ -280,10 +378,11 @@
             cl_image_desc image_desc = { 0 };
-            image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+            image_desc.image_type = imageInfo->type;
             image_desc.image_width = imageInfo->width;
             image_desc.image_height = imageInfo->height;
             image_desc.image_depth = imageInfo->depth;
+            image_desc.image_array_size = imageInfo->arraySize;
             image_desc.num_mip_levels = imageInfo->num_mip_levels;
@@ -292,23 +391,24 @@
                               imageInfo->format, &image_desc, NULL, &error);
             if (error != CL_SUCCESS)
-                log_error("ERROR: Unable to create %d level mipmapped 3D image "
-                          "of size %d x %d x %d (pitch %d, %d ) (%s)",
+                log_error("ERROR: Unable to create %d level mipmapped image "
+                          "of size %d x %d x %d x %d (pitch %d, %d ) (%s)",
                           (int)imageInfo->num_mip_levels, (int)imageInfo->width,
                           (int)imageInfo->height, (int)imageInfo->depth,
-                          (int)imageInfo->rowPitch, (int)imageInfo->slicePitch,
-                          IGetErrorString(error));
+                          (int)imageInfo->arraySize, (int)imageInfo->rowPitch,
+                          (int)imageInfo->slicePitch, IGetErrorString(error));
                 return error;
             image = unprotImage;
+    test_assert_error(nullptr != image, "Image creation failed");
     if (gMemFlagsToUse != CL_MEM_COPY_HOST_PTR)
         size_t origin[4] = { 0, 0, 0, 0 };
-        size_t region[3] = { imageInfo->width, imageInfo->height,
-                             imageInfo->depth };
+        size_t region[3] = { width_size, height_size, depth_size };
         if (gDebugTrace) log_info(" - Writing image...\n");
@@ -323,10 +423,10 @@
             if (error != CL_SUCCESS)
-                log_error("ERROR: Unable to write to 3D image of size %d x %d "
-                          "x %d \n",
+                log_error("ERROR: Unable to write to image of size %d x %d "
+                          "x %d x %d\n",
                           (int)imageInfo->width, (int)imageInfo->height,
-                          (int)imageInfo->depth);
+                          (int)imageInfo->depth, (int)imageInfo->arraySize);
                 return error;
@@ -338,17 +438,15 @@
                 origin[3] = i;
                 error = clEnqueueWriteImage(
-                    queue, image, CL_TRUE, origin, region,
-                    /*gEnablePitch ? imageInfo->rowPitch :*/ 0,
-                    /*gEnablePitch ? imageInfo->slicePitch :*/ 0,
+                    queue, image, CL_TRUE, origin, region, 0, 0,
                     ((char *)imageValues + nextLevelOffset), 0, NULL, NULL);
                 if (error != CL_SUCCESS)
-                    log_error("ERROR: Unable to write to %d level mipmapped 3D "
-                              "image of size %d x %d x %d\n",
+                    log_error("ERROR: Unable to write to %d level mipmapped "
+                              "image of size %d x %d x %d x %d\n",
                               (int)imageInfo->width, (int)imageInfo->height,
-                              (int)imageInfo->depth);
+                              (int)imageInfo->arraySize, (int)imageInfo->depth);
                     return error;
                 nextLevelOffset += region[0] * region[1] * region[2]
@@ -361,26 +459,21 @@
-    xOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                              sizeof(cl_float) * imageInfo->width
-                                  * imageInfo->height * imageInfo->depth,
-                              xOffsetValues, &error);
+    xOffsets =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       sizeof(cl_float) * image_size, xOffsetValues, &error);
     test_error(error, "Unable to create x offset buffer");
-    yOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                              sizeof(cl_float) * imageInfo->width
-                                  * imageInfo->height * imageInfo->depth,
-                              yOffsetValues, &error);
+    yOffsets =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       sizeof(cl_float) * image_size, yOffsetValues, &error);
     test_error(error, "Unable to create y offset buffer");
-    zOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                              sizeof(cl_float) * imageInfo->width
-                                  * imageInfo->height * imageInfo->depth,
-                              zOffsetValues, &error);
+    zOffsets =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       sizeof(cl_float) * image_size, zOffsetValues, &error);
     test_error(error, "Unable to create y offset buffer");
-    results =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       get_explicit_type_size(outputType) * 4 * imageInfo->width
-                           * imageInfo->height * imageInfo->depth,
-                       NULL, &error);
+    results = clCreateBuffer(
+        context, CL_MEM_READ_WRITE,
+        get_explicit_type_size(outputType) * 4 * image_size, NULL, &error);
     test_error(error, "Unable to create result buffer");
     // Create sampler to use
@@ -443,16 +536,19 @@
     int nextLevelOffset = 0;
-    size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
-           depth_lod = imageInfo->depth;
+    size_t width_lod = width_size, height_lod = height_size,
+           depth_lod = depth_size;
     // Loop over all mipmap levels, if we are testing mipmapped images.
     for (int lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels)
          || (!gTestMipmaps && lod < 1);
-        size_t resultValuesSize = width_lod * height_lod * depth_lod
-            * get_explicit_type_size(outputType) * 4;
+        size_t image_lod_size = get_image_num_pixels(
+            imageInfo, width_lod, height_lod, depth_lod, imageInfo->arraySize);
+        test_assert_error(0 != image_lod_size, "Invalid image size");
+        size_t resultValuesSize =
+            image_lod_size * get_explicit_type_size(outputType) * 4;
         BufferOwningPtr<char> resultValues(malloc(resultValuesSize));
         float lod_float = (float)lod;
         if (gTestMipmaps)
@@ -468,30 +564,25 @@
             float offset = float_offsets[q % float_offset_count];
             // Init the coordinates
-            InitFloatCoordsCommon(imageInfo, imageSampler, xOffsetValues,
-                                  yOffsetValues, zOffsetValues,
-                                  q >= float_offset_count ? -offset : offset,
-                                  q >= float_offset_count ? offset : -offset,
-                                  q >= float_offset_count ? -offset : offset,
-                                  imageSampler->normalized_coords, d, lod);
+            error = InitFloatCoordsCommon(
+                imageInfo, imageSampler, xOffsetValues, yOffsetValues,
+                zOffsetValues, q >= float_offset_count ? -offset : offset,
+                q >= float_offset_count ? offset : -offset,
+                q >= float_offset_count ? -offset : offset,
+                imageSampler->normalized_coords, d, lod);
+            test_error(error, "Unable to initialise coordinates");
-            error =
-                clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0,
-                                     sizeof(cl_float) * imageInfo->height
-                                         * imageInfo->width * imageInfo->depth,
-                                     xOffsetValues, 0, NULL, NULL);
+            error = clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0,
+                                         sizeof(cl_float) * image_size,
+                                         xOffsetValues, 0, NULL, NULL);
             test_error(error, "Unable to write x offsets");
-            error =
-                clEnqueueWriteBuffer(queue, yOffsets, CL_TRUE, 0,
-                                     sizeof(cl_float) * imageInfo->height
-                                         * imageInfo->width * imageInfo->depth,
-                                     yOffsetValues, 0, NULL, NULL);
+            error = clEnqueueWriteBuffer(queue, yOffsets, CL_TRUE, 0,
+                                         sizeof(cl_float) * image_size,
+                                         yOffsetValues, 0, NULL, NULL);
             test_error(error, "Unable to write y offsets");
-            error =
-                clEnqueueWriteBuffer(queue, zOffsets, CL_TRUE, 0,
-                                     sizeof(cl_float) * imageInfo->height
-                                         * imageInfo->width * imageInfo->depth,
-                                     zOffsetValues, 0, NULL, NULL);
+            error = clEnqueueWriteBuffer(queue, zOffsets, CL_TRUE, 0,
+                                         sizeof(cl_float) * image_size,
+                                         zOffsetValues, 0, NULL, NULL);
             test_error(error, "Unable to write z offsets");
@@ -510,11 +601,10 @@
             test_error(error, "Unable to run kernel");
             // Get results
-            error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0,
-                                        width_lod * height_lod * depth_lod
-                                            * get_explicit_type_size(outputType)
-                                            * 4,
-                                        resultValues, 0, NULL, NULL);
+            error = clEnqueueReadBuffer(
+                queue, results, CL_TRUE, 0,
+                image_lod_size * get_explicit_type_size(outputType) * 4,
+                resultValues, 0, NULL, NULL);
             test_error(error, "Unable to read results from kernel");
             if (gDebugTrace) log_info("    results read\n");
@@ -556,7 +646,7 @@
                                 // Apple requires its CPU implementation to do
                                 // correctly rounded address arithmetic in all
                                 // modes
-                                || gDeviceType != CL_DEVICE_TYPE_GPU
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
                                 offset = 0.0f; // Loop only once
@@ -874,7 +964,7 @@
                                 // Apple requires its CPU implementation to do
                                 // correctly rounded address arithmetic in all
                                 // modes
-                                || gDeviceType != CL_DEVICE_TYPE_GPU
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
                                 offset = 0.0f; // Loop only once
@@ -934,13 +1024,13 @@
                                             err4 = 0.0f;
-                                        float maxErr1 = MAX(
+                                        float maxErr1 = std::max(
                                             maxErr * maxPixel.p[0], FLT_MIN);
-                                        float maxErr2 = MAX(
+                                        float maxErr2 = std::max(
                                             maxErr * maxPixel.p[1], FLT_MIN);
-                                        float maxErr3 = MAX(
+                                        float maxErr3 = std::max(
                                             maxErr * maxPixel.p[2], FLT_MIN);
-                                        float maxErr4 = MAX(
+                                        float maxErr4 = std::max(
                                             maxErr * maxPixel.p[3], FLT_MIN);
                                         if (!(err1 <= maxErr1)
@@ -1039,17 +1129,17 @@
                                             float err4 = ABS_ERROR(resultPtr[3],
                                             float maxErr1 =
-                                                MAX(maxErr * maxPixel.p[0],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[0],
+                                                         FLT_MIN);
                                             float maxErr2 =
-                                                MAX(maxErr * maxPixel.p[1],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[1],
+                                                         FLT_MIN);
                                             float maxErr3 =
-                                                MAX(maxErr * maxPixel.p[2],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[2],
+                                                         FLT_MIN);
                                             float maxErr4 =
-                                                MAX(maxErr * maxPixel.p[3],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[3],
+                                                         FLT_MIN);
                                             if (!(err1 <= maxErr1)
@@ -1213,7 +1303,8 @@
                                         // offsets (0.0, 0.0) E.g., test one
                                         // pixel.
                                         if (!imageSampler->normalized_coords
-                                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                                            || !(gDeviceType
+                                                 & CL_DEVICE_TYPE_GPU)
                                             || NORM_OFFSET == 0)
                                             norm_offset_x = 0.0f;
@@ -1395,7 +1486,8 @@
                                         // offsets (0.0, 0.0) E.g., test one
                                         // pixel.
                                         if (!imageSampler->normalized_coords
-                                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                                            || !(gDeviceType
+                                                 & CL_DEVICE_TYPE_GPU)
                                             || NORM_OFFSET == 0)
                                             norm_offset_x = 0.0f;
@@ -1537,10 +1629,51 @@
             nextLevelOffset += width_lod * height_lod * depth_lod
                 * get_pixel_size(imageInfo->format);
             width_lod = (width_lod >> 1) ? (width_lod >> 1) : 1;
-            height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1;
-            depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1;
+            if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
+            {
+                height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1;
+            }
+            if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
+            {
+                depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1;
+            }
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
\ No newline at end of file
+void filter_undefined_bits(image_descriptor *imageInfo, char *resultPtr)
+    // Clear bit 15 of the first 16-bit word at resultPtr when the channel data
+    // type is CL_UNORM_SHORT_555 (only the data type is checked here). OpenCL
+    // leaves that top bit undefined, so it may read back as either 0 or 1.
+    if (imageInfo->format->image_channel_data_type == CL_UNORM_SHORT_555)
+    {
+        cl_ushort *temp = (cl_ushort *)resultPtr; // view bytes as one 16-bit word
+        temp[0] &= 0x7fff; // keep bits 0-14, clear bit 15
+    }
+int filter_rounding_errors(int forceCorrectlyRoundedWrites,
+                           image_descriptor *imageInfo, float *errors)
+    // Returns 0 when correctly rounded writes are not forced, the data type is
+    // one of the normalized types below, and no |errors[0..3]| exceeds 0.6.
+    if (0 == forceCorrectlyRoundedWrites
+        && (imageInfo->format->image_channel_data_type == CL_UNORM_INT8
+            || imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010
+            || imageInfo->format->image_channel_data_type == CL_UNORM_INT16
+            || imageInfo->format->image_channel_data_type == CL_SNORM_INT8
+            || imageInfo->format->image_channel_data_type == CL_SNORM_INT16
+            || imageInfo->format->image_channel_data_type == CL_UNORM_SHORT_555
+            || imageInfo->format->image_channel_data_type
+                == CL_UNORM_SHORT_565))
+    {
+        if (!(fabsf(errors[0]) > 0.6f) && !(fabsf(errors[1]) > 0.6f)
+            && !(fabsf(errors[2]) > 0.6f) && !(fabsf(errors[3]) > 0.6f))
+            return 0; // negated '>' form: a NaN component is not flagged either
+    }
+    return 1; // forced rounding, another data type, or an error above 0.6
diff --git a/test_conformance/images/kernel_read_write/test_common.h b/test_conformance/images/kernel_read_write/test_common.h
index e7ecbe0..fc95bee 100644
--- a/test_conformance/images/kernel_read_write/test_common.h
+++ b/test_conformance/images/kernel_read_write/test_common.h
@@ -42,12 +42,8 @@
                            bool useFloatCoords, ExplicitType outputType,
                            MTdata d);
-extern void InitFloatCoordsCommon(image_descriptor *imageInfo,
-                                  image_sampler_data *imageSampler,
-                                  float *xOffsets, float *yOffsets,
-                                  float *zOffsets, float xfract, float yfract,
-                                  float zfract, int normalized_coords, MTdata d,
-                                  int lod);
+extern bool get_image_dimensions(image_descriptor *imageInfo, size_t &width,
+                                 size_t &height, size_t &depth);
 template <class T>
 int determine_validation_error_offset(
@@ -63,8 +59,12 @@
     bool clampingErr = false, clamped = false, otherClampingBug = false;
     int clampedX, clampedY, clampedZ;
-    size_t imageWidth = imageInfo->width, imageHeight = imageInfo->height,
-           imageDepth = imageInfo->depth;
+    size_t imageWidth, imageHeight, imageDepth;
+    if (get_image_dimensions(imageInfo, imageWidth, imageHeight, imageDepth))
+    {
+        log_error("ERROR: invalid image dimensions");
+        return TEST_FAIL;
+    }
     clamped = get_integer_coords_offset(x, y, z, xAddressOffset, yAddressOffset,
                                         zAddressOffset, imageWidth, imageHeight,
@@ -147,85 +147,75 @@
     if (!clampingErr)
-        /*        if( clamped && ( (int)x + (int)xOffsetValues[ j ] < 0 ||
-         (int)y + (int)yOffsetValues[ j ] < 0 ) )
-         {
-         log_error( "NEGATIVE COORDINATE ERROR\n" );
-         return -1;
-         }
-         */
-        if (true) // gExtraValidateInfo )
+        if (printAsFloat)
-            if (printAsFloat)
-            {
-                log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not "
-                          "validate!\n\tExpected (%g,%g,%g,%g),\n\t     got "
-                          "(%g,%g,%g,%g), error of %g\n",
-                          j, x, x, y, y, z, z, (float)expected[0],
-                          (float)expected[1], (float)expected[2],
-                          (float)expected[3], (float)resultPtr[0],
-                          (float)resultPtr[1], (float)resultPtr[2],
-                          (float)resultPtr[3], error);
-            }
-            else
-            {
-                log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not "
-                          "validate!\n\tExpected (%x,%x,%x,%x),\n\t     got "
-                          "(%x,%x,%x,%x)\n",
-                          j, x, x, y, y, z, z, (int)expected[0],
-                          (int)expected[1], (int)expected[2], (int)expected[3],
-                          (int)resultPtr[0], (int)resultPtr[1],
-                          (int)resultPtr[2], (int)resultPtr[3]);
-            }
-            log_error(
-                "Integer coords resolve to %d,%d,%d   with img size %d,%d,%d\n",
-                clampedX, clampedY, clampedZ, (int)imageWidth, (int)imageHeight,
-                (int)imageDepth);
+            log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not "
+                      "validate!\n\tExpected (%g,%g,%g,%g),\n\t     got "
+                      "(%g,%g,%g,%g), error of %g\n",
+                      j, x, x, y, y, z, z, (float)expected[0],
+                      (float)expected[1], (float)expected[2],
+                      (float)expected[3], (float)resultPtr[0],
+                      (float)resultPtr[1], (float)resultPtr[2],
+                      (float)resultPtr[3], error);
+        }
+        else
+        {
+            log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not "
+                      "validate!\n\tExpected (%x,%x,%x,%x),\n\t     got "
+                      "(%x,%x,%x,%x)\n",
+                      j, x, x, y, y, z, z, (int)expected[0], (int)expected[1],
+                      (int)expected[2], (int)expected[3], (int)resultPtr[0],
+                      (int)resultPtr[1], (int)resultPtr[2], (int)resultPtr[3]);
+        }
+        log_error(
+            "Integer coords resolve to %d,%d,%d   with img size %d,%d,%d\n",
+            clampedX, clampedY, clampedZ, (int)imageWidth, (int)imageHeight,
+            (int)imageDepth);
-            if (printAsFloat && gExtraValidateInfo)
+        if (printAsFloat && gExtraValidateInfo)
+        {
+            log_error("\nNearby values:\n");
+            for (int zOff = -1; zOff <= 1; zOff++)
-                log_error("\nNearby values:\n");
-                for (int zOff = -1; zOff <= 1; zOff++)
+                for (int yOff = -1; yOff <= 1; yOff++)
-                    for (int yOff = -1; yOff <= 1; yOff++)
-                    {
-                        float top[4], real[4], bot[4];
-                        read_image_pixel_float(imagePtr, imageInfo,
-                                               clampedX - 1, clampedY + yOff,
-                                               clampedZ + zOff, top);
-                        read_image_pixel_float(imagePtr, imageInfo, clampedX,
-                                               clampedY + yOff, clampedZ + zOff,
-                                               real);
-                        read_image_pixel_float(imagePtr, imageInfo,
-                                               clampedX + 1, clampedY + yOff,
-                                               clampedZ + zOff, bot);
-                        log_error("\t(%g,%g,%g,%g)", top[0], top[1], top[2],
-                                  top[3]);
-                        log_error(" (%g,%g,%g,%g)", real[0], real[1], real[2],
-                                  real[3]);
-                        log_error(" (%g,%g,%g,%g)\n", bot[0], bot[1], bot[2],
-                                  bot[3]);
-                    }
+                    float top[4], real[4], bot[4];
+                    read_image_pixel_float(imagePtr, imageInfo, clampedX - 1,
+                                           clampedY + yOff, clampedZ + zOff,
+                                           top);
+                    read_image_pixel_float(imagePtr, imageInfo, clampedX,
+                                           clampedY + yOff, clampedZ + zOff,
+                                           real);
+                    read_image_pixel_float(imagePtr, imageInfo, clampedX + 1,
+                                           clampedY + yOff, clampedZ + zOff,
+                                           bot);
+                    log_error("\t(%g,%g,%g,%g)", top[0], top[1], top[2],
+                              top[3]);
+                    log_error(" (%g,%g,%g,%g)", real[0], real[1], real[2],
+                              real[3]);
+                    log_error(" (%g,%g,%g,%g)\n", bot[0], bot[1], bot[2],
+                              bot[3]);
-            //        }
-            //        else
-            //            log_error( "\n" );
-            if (imageSampler->filter_mode != CL_FILTER_LINEAR)
-            {
-                if (found)
-                    log_error(
-                        "\tValue really found in image at %d,%d,%d (%s)\n",
-                        actualX, actualY, actualZ,
-                        (found > 1) ? "NOT unique!!" : "unique");
-                else
-                    log_error("\tValue not actually found in image\n");
-            }
-            log_error("\n");
+        if (imageSampler->filter_mode != CL_FILTER_LINEAR)
+        {
+            if (found)
+                log_error("\tValue really found in image at %d,%d,%d (%s)\n",
+                          actualX, actualY, actualZ,
+                          (found > 1) ? "NOT unique!!" : "unique");
+            else
+                log_error("\tValue not actually found in image\n");
+        }
+        log_error("\n");
         numClamped = -1; // We force the clamped counter to never work
         if ((--numTries) == 0) return -1;
     return 0;
+extern int filter_rounding_errors(int forceCorrectlyRoundedWrites,
+                                  image_descriptor *imageInfo, float *errors);
+extern void filter_undefined_bits(image_descriptor *imageInfo, char *resultPtr);
diff --git a/test_conformance/images/kernel_read_write/test_iterations.cpp b/test_conformance/images/kernel_read_write/test_iterations.cpp
index 03ca959..05aed02 100644
--- a/test_conformance/images/kernel_read_write/test_iterations.cpp
+++ b/test_conformance/images/kernel_read_write/test_iterations.cpp
@@ -16,6 +16,8 @@
 #include "test_common.h"
 #include <float.h>
+#include <algorithm>
 #if defined( __APPLE__ )
     #include <signal.h>
     #include <sys/signal.h>
@@ -37,24 +39,28 @@
 const char *read2DKernelSourcePattern =
-"__kernel void sample_kernel( read_only %s input,%s __global float *xOffsets, __global float *yOffsets, __global %s%s *results %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_only %s input,%s __global float "
+    "*xOffsets, __global float *yOffsets, __global %s%s *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "}";
 const char *read_write2DKernelSourcePattern =
-"__kernel void sample_kernel( read_write %s input,%s __global float *xOffsets, __global float *yOffsets, __global %s%s *results %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   results[offset] = read_image%s( input, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_write %s input,%s __global float "
+    "*xOffsets, __global float *yOffsets, __global %s%s *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, coords %s);\n"
+    "}";
 const char *intCoordKernelSource =
 "   int2 coords = (int2)( xOffsets[offset], yOffsets[offset]);\n";
@@ -413,12 +419,15 @@
                 int checkOnlyOnePixel = 0;
                 int found_pixel = 0;
                 float offset = NORM_OFFSET;
-                if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                if (!imageSampler->normalized_coords
+                    || imageSampler->filter_mode != CL_FILTER_NEAREST
+                    || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                    // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                    || gDeviceType != CL_DEVICE_TYPE_GPU
+                    // Apple requires its CPU implementation to do correctly
+                    // rounded address arithmetic in all modes
+                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                    )
+                )
                     offset = 0.0f;          // Loop only once
                 for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) {
@@ -434,7 +443,8 @@
                         float err1 = ABS_ERROR(resultPtr[0], expected[0]);
                         // Clamp to the minimum absolute error for the format
                         if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; }
-                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                        float maxErr1 =
+                            std::max(maxErr * maxPixel.p[0], FLT_MIN);
                         // Check if the result matches.
                         if( ! (err1 <= maxErr1) )
@@ -471,7 +481,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -484,7 +497,8 @@
                                                                                     imageSampler, expected, 0, &containsDenormals );
                             float err1 = ABS_ERROR(resultPtr[0], expected[0]);
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
                             if( ! (err1 <= maxErr1) )
@@ -565,12 +579,15 @@
                 int checkOnlyOnePixel = 0;
                 int found_pixel = 0;
                 float offset = NORM_OFFSET;
-                if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                if (!imageSampler->normalized_coords
+                    || imageSampler->filter_mode != CL_FILTER_NEAREST
+                    || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                    // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                    || gDeviceType != CL_DEVICE_TYPE_GPU
+                    // Apple requires its CPU implementation to do correctly
+                    // rounded address arithmetic in all modes
+                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                    )
+                )
                     offset = 0.0f;          // Loop only once
                 for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) {
@@ -598,10 +615,14 @@
                         if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                         if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                         if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                        float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                        float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                        float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                        float maxErr1 =
+                            std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                        float maxErr2 =
+                            std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                        float maxErr3 =
+                            std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                        float maxErr4 =
+                            std::max(maxErr * maxPixel.p[3], FLT_MIN);
                         // Check if the result matches.
                         if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -650,7 +671,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -671,10 +695,14 @@
                             float err2 = ABS_ERROR(resultPtr[1], expected[1]);
                             float err3 = ABS_ERROR(resultPtr[2], expected[2]);
                             float err4 = ABS_ERROR(resultPtr[3], expected[3]);
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                            float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                            float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                            float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                            float maxErr2 =
+                                std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                            float maxErr3 =
+                                std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                            float maxErr4 =
+                                std::max(maxErr * maxPixel.p[3], FLT_MIN);
                             if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -766,7 +794,10 @@
                         // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                         // E.g., test one pixel.
-                        if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                        if (!imageSampler->normalized_coords
+                            || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                            || NORM_OFFSET == 0)
+                        {
                             norm_offset_x = 0.0f;
                             norm_offset_y = 0.0f;
                             checkOnlyOnePixel = 1;
@@ -801,7 +832,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -862,7 +896,10 @@
                         // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                         // E.g., test one pixel.
-                        if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                        if (!imageSampler->normalized_coords
+                            || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                            || NORM_OFFSET == 0)
+                        {
                             norm_offset_x = 0.0f;
                             norm_offset_y = 0.0f;
                             checkOnlyOnePixel = 1;
@@ -897,7 +934,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -963,12 +1003,15 @@
                 int checkOnlyOnePixel = 0;
                 int found_pixel = 0;
                 float offset = NORM_OFFSET;
-                if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                if (!imageSampler->normalized_coords
+                    || imageSampler->filter_mode != CL_FILTER_NEAREST
+                    || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                    // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                    || gDeviceType != CL_DEVICE_TYPE_GPU
+                    // Apple requires its CPU implementation to do correctly
+                    // rounded address arithmetic in all modes
+                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                    )
+                )
                     offset = 0.0f;          // Loop only once
                 for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) {
@@ -1042,7 +1085,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -1649,16 +1695,18 @@
-    sprintf( programSrc, KernelSourcePattern,
-            (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t" : "image2d_t",
-            samplerArg, get_explicit_type_name( outputType ),
+    sprintf(programSrc, KernelSourcePattern,
+            gTestMipmaps
+                ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable"
+                : "",
+            (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t"
+                                                      : "image2d_t",
+            samplerArg, get_explicit_type_name(outputType),
             (format->image_channel_order == CL_DEPTH) ? "" : "4",
-            gTestMipmaps?", float lod":" ",
-            samplerVar,
-            gTestMipmaps? lodOffsetSource : offsetSource,
-            floatCoords ? floatKernelSource : intCoordKernelSource,
-            readFormat,
-            gTestMipmaps?", lod":" ");
+            gTestMipmaps ? ", float lod" : " ", samplerVar,
+            gTestMipmaps ? lodOffsetSource : offsetSource,
+            floatCoords ? floatKernelSource : intCoordKernelSource, readFormat,
+            gTestMipmaps ? ", lod" : " ");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_loops.cpp b/test_conformance/images/kernel_read_write/test_loops.cpp
index 795a9ed..ea1e1c7 100644
--- a/test_conformance/images/kernel_read_write/test_loops.cpp
+++ b/test_conformance/images/kernel_read_write/test_loops.cpp
@@ -84,7 +84,7 @@
     // of operations for linear filtering on the GPU.  We do not test linear
     // filtering for the CL_RGB CL_UNORM_INT_101010 image format; however, we
     // test it internally for a set of other image formats.
-    if ((gDeviceType == CL_DEVICE_TYPE_GPU)
+    if ((gDeviceType & CL_DEVICE_TYPE_GPU)
         && (imageSampler->filter_mode == CL_FILTER_LINEAR)
         && (format->image_channel_order == CL_RGB)
         && (format->image_channel_data_type == CL_UNORM_INT_101010))
diff --git a/test_conformance/images/kernel_read_write/test_read_1D.cpp b/test_conformance/images/kernel_read_write/test_read_1D.cpp
index c9ba4e8..2a72208 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D.cpp
@@ -17,6 +17,8 @@
 #include "test_common.h"
 #include <float.h>
+#include <algorithm>
 #if defined( __APPLE__ )
     #include <signal.h>
     #include <sys/signal.h>
@@ -24,24 +26,28 @@
 const char *read1DKernelSourcePattern =
-"__kernel void sample_kernel( read_only image1d_t input,%s __global float *xOffsets, __global %s4 *results %s)\n"
-"   int tidX = get_global_id(0);\n"
-"   int offset = tidX;\n"
-"   results[offset] = read_image%s( input, imageSampler, coord %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_only image1d_t input,%s __global float "
+    "*xOffsets, __global %s4 *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0);\n"
+    "   int offset = tidX;\n"
+    "%s"
+    "   results[offset] = read_image%s( input, imageSampler, coord %s);\n"
+    "}";
 const char *read_write1DKernelSourcePattern =
-"__kernel void sample_kernel( read_write image1d_t input,%s __global float *xOffsets, __global %s4 *results %s)\n"
-"   int tidX = get_global_id(0);\n"
-"   int offset = tidX;\n"
-"   results[offset] = read_image%s( input, coord %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_write image1d_t input,%s __global float "
+    "*xOffsets, __global %s4 *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0);\n"
+    "   int offset = tidX;\n"
+    "%s"
+    "   results[offset] = read_image%s( input, coord %s);\n"
+    "}";
 const char *int1DCoordKernelSource =
 "   int coord = xOffsets[offset];\n";
@@ -485,10 +491,13 @@
                     int checkOnlyOnePixel = 0;
                     int found_pixel = 0;
                     float offset = NORM_OFFSET;
-                    if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                    if (!imageSampler->normalized_coords
+                        || imageSampler->filter_mode != CL_FILTER_NEAREST
+                        || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                        // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                        || gDeviceType != CL_DEVICE_TYPE_GPU
+                        // Apple requires its CPU implementation to do correctly
+                        // rounded address arithmetic in all modes
+                        || !(gDeviceType & CL_DEVICE_TYPE_GPU)
                         offset = 0.0f;          // Loop only once
@@ -551,7 +560,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -644,10 +656,13 @@
                     int checkOnlyOnePixel = 0;
                     int found_pixel = 0;
                     float offset = NORM_OFFSET;
-                    if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                    if (!imageSampler->normalized_coords
+                        || imageSampler->filter_mode != CL_FILTER_NEAREST
+                        || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                        // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                        || gDeviceType != CL_DEVICE_TYPE_GPU
+                        // Apple requires its CPU implementation to do correctly
+                        // rounded address arithmetic in all modes
+                        || !(gDeviceType & CL_DEVICE_TYPE_GPU)
                         offset = 0.0f;          // Loop only once
@@ -669,10 +684,14 @@
                             if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                             if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                             if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                            float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                            float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                            float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                            float maxErr2 =
+                                std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                            float maxErr3 =
+                                std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                            float maxErr4 =
+                                std::max(maxErr * maxPixel.p[3], FLT_MIN);
                             // Check if the result matches.
                             if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -714,7 +733,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -732,10 +754,14 @@
                                     ABS_ERROR(resultPtr[2], expected[2]);
                                 float err4 =
                                     ABS_ERROR(resultPtr[3], expected[3]);
-                                float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                float maxErr1 =
+                                    std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                                float maxErr2 =
+                                    std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                                float maxErr3 =
+                                    std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                                float maxErr4 =
+                                    std::max(maxErr * maxPixel.p[3], FLT_MIN);
                                 if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -816,7 +842,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -847,7 +876,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -903,7 +935,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -934,7 +969,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -1041,14 +1079,14 @@
         KernelSourcePattern = read1DKernelSourcePattern;
-    sprintf( programSrc,
-            KernelSourcePattern,
-            samplerArg, get_explicit_type_name( outputType ),
-            gTestMipmaps ? ", float lod" : "",
-            samplerVar,
+    sprintf(programSrc, KernelSourcePattern,
+            gTestMipmaps
+                ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable"
+                : "",
+            samplerArg, get_explicit_type_name(outputType),
+            gTestMipmaps ? ", float lod" : "", samplerVar,
             floatCoords ? float1DKernelSource : int1DCoordKernelSource,
-            readFormat,
-            gTestMipmaps ? ", lod" : "" );
+            readFormat, gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
diff --git a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
index b3287de..a800942 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
@@ -16,32 +16,37 @@
 #include "test_common.h"
 #include <float.h>
+#include <algorithm>
 #if defined( __APPLE__ )
 #include <signal.h>
 #include <sys/signal.h>
 #include <setjmp.h>
 const char *read1DArrayKernelSourcePattern =
-"__kernel void sample_kernel( read_only image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_only image1d_array_t input,%s __global "
+    "float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "}";
 const char *read_write1DArrayKernelSourcePattern =
-"__kernel void sample_kernel( read_write image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s )\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   results[offset] = read_image%s( input, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_write image1d_array_t input,%s __global "
+    "float *xOffsets, __global float *yOffsets, __global %s4 *results %s )\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, coords %s);\n"
+    "}";
 const char *offset1DArrayKernelSource =
 "   int offset = tidY*get_image_width(input) + tidX;\n";
@@ -577,12 +582,15 @@
                     int checkOnlyOnePixel = 0;
                     int found_pixel = 0;
                     float offset = NORM_OFFSET;
-                    if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                    if (!imageSampler->normalized_coords
+                        || imageSampler->filter_mode != CL_FILTER_NEAREST
+                        || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                        // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                        || gDeviceType != CL_DEVICE_TYPE_GPU
+                        // Apple requires its CPU implementation to do correctly
+                        // rounded address arithmetic in all modes
+                        || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                        )
+                    )
                         offset = 0.0f;          // Loop only once
                     for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) {
@@ -646,7 +654,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     norm_offset_y = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -745,12 +756,15 @@
                     int checkOnlyOnePixel = 0;
                     int found_pixel = 0;
                     float offset = NORM_OFFSET;
-                    if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                    if (!imageSampler->normalized_coords
+                        || imageSampler->filter_mode != CL_FILTER_NEAREST
+                        || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                        // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                        || gDeviceType != CL_DEVICE_TYPE_GPU
+                        // Apple requires its CPU implementation to do correctly
+                        // rounded address arithmetic in all modes
+                        || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                        )
+                    )
                         offset = 0.0f;          // Loop only once
                     for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) {
@@ -772,10 +786,14 @@
                             if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                             if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                             if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                            float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                            float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                            float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                            float maxErr2 =
+                                std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                            float maxErr3 =
+                                std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                            float maxErr4 =
+                                std::max(maxErr * maxPixel.p[3], FLT_MIN);
                             // Check if the result matches.
                             if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -819,7 +837,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     norm_offset_y = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -838,10 +859,14 @@
                                     ABS_ERROR(resultPtr[2], expected[2]);
                                 float err4 =
                                     ABS_ERROR(resultPtr[3], expected[3]);
-                                float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                float maxErr1 =
+                                    std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                                float maxErr2 =
+                                    std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                                float maxErr3 =
+                                    std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                                float maxErr4 =
+                                    std::max(maxErr * maxPixel.p[3], FLT_MIN);
                                 if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -926,7 +951,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -956,7 +984,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     norm_offset_y = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -1012,7 +1043,10 @@
                             // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                             // E.g., test one pixel.
-                            if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                            if (!imageSampler->normalized_coords
+                                || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                || NORM_OFFSET == 0)
+                            {
                                 norm_offset_x = 0.0f;
                                 norm_offset_y = 0.0f;
                                 checkOnlyOnePixel = 1;
@@ -1042,7 +1076,10 @@
                                 // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                 // E.g., test one pixel.
-                                if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                if (!imageSampler->normalized_coords
+                                    || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                    || NORM_OFFSET == 0)
+                                {
                                     norm_offset_x = 0.0f;
                                     norm_offset_y = 0.0f;
                                     checkOnlyOnePixel = 1;
@@ -1147,15 +1184,15 @@
         KernelSourcePattern = read_write1DArrayKernelSourcePattern;
-    sprintf( programSrc,
-            KernelSourcePattern,
-            samplerArg, get_explicit_type_name( outputType ),
-            gTestMipmaps ? ", float lod" : "",
-            samplerVar,
-            gTestMipmaps ? offset1DArrayLodKernelSource : offset1DArrayKernelSource,
-            floatCoords ? floatKernelSource1DArray : intCoordKernelSource1DArray,
-            readFormat,
-            gTestMipmaps ? ", lod" : "" );
+    sprintf(
+        programSrc, KernelSourcePattern,
+        gTestMipmaps ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable"
+                     : "",
+        samplerArg, get_explicit_type_name(outputType),
+        gTestMipmaps ? ", float lod" : "", samplerVar,
+        gTestMipmaps ? offset1DArrayLodKernelSource : offset1DArrayKernelSource,
+        floatCoords ? floatKernelSource1DArray : intCoordKernelSource1DArray,
+        readFormat, gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
index 7cb334b..533a0fe 100644
--- a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
@@ -16,6 +16,8 @@
 #include "test_common.h"
 #include <float.h>
+#include <algorithm>
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
 static size_t reduceImageSizeRange(size_t maxDimSize) {
@@ -39,24 +41,32 @@
 const char *read2DArrayKernelSourcePattern =
-"__kernel void sample_kernel( read_only %s input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets,  __global %s%s *results %s )\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_only %s input,%s __global float "
+    "*xOffsets, __global float *yOffsets, __global float *zOffsets,  __global "
+    "%s%s *results %s )\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "}";
 const char *read_write2DArrayKernelSourcePattern =
-"__kernel void sample_kernel( read_write %s input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets,  __global %s%s *results %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   results[offset] = read_image%s( input, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_write %s input,%s __global float "
+    "*xOffsets, __global float *yOffsets, __global float *zOffsets,  __global "
+    "%s%s *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, coords %s);\n"
+    "}";
 const char* offset2DarraySource ="   int offset = tidZ*get_image_width(input)*get_image_height(input) + tidY*get_image_width(input) + tidX;\n";
 const char* offset2DarraySourceLod =
@@ -595,12 +605,15 @@
                         int checkOnlyOnePixel = 0;
                         int found_pixel = 0;
                         float offset = NORM_OFFSET;
-                        if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                        if (!imageSampler->normalized_coords
+                            || imageSampler->filter_mode != CL_FILTER_NEAREST
+                            || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                            // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                            // Apple requires its CPU implementation to do
+                            // correctly rounded address arithmetic in all modes
+                            || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                            )
+                        )
                             offset = 0.0f;          // Loop only once
                         for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) {
@@ -617,7 +630,8 @@
                                         ABS_ERROR(resultPtr[0], expected[0]);
                                     // Clamp to the minimum absolute error for the format
                                     if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; }
-                                    float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                                    float maxErr1 = std::max(
+                                        maxErr * maxPixel.p[0], FLT_MIN);
                                     if( ! (err1 <= maxErr1) )
@@ -661,7 +675,8 @@
                                         float err1 = ABS_ERROR(resultPtr[0],
-                                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                                        float maxErr1 = std::max(
+                                            maxErr * maxPixel.p[0], FLT_MIN);
                                         if( ! (err1 <= maxErr1) )
@@ -734,12 +749,15 @@
                         int checkOnlyOnePixel = 0;
                         int found_pixel = 0;
                         float offset = NORM_OFFSET;
-                        if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                        if (!imageSampler->normalized_coords
+                            || imageSampler->filter_mode != CL_FILTER_NEAREST
+                            || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                            // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                            // Apple requires its CPU implementation to do
+                            // correctly rounded address arithmetic in all modes
+                            || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                            )
+                        )
                             offset = 0.0f;          // Loop only once
                         for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) {
@@ -911,12 +929,15 @@
                         int checkOnlyOnePixel = 0;
                         int found_pixel = 0;
                         float offset = NORM_OFFSET;
-                        if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
+                        if (!imageSampler->normalized_coords
+                            || imageSampler->filter_mode != CL_FILTER_NEAREST
+                            || NORM_OFFSET == 0
 #if defined( __APPLE__ )
-                            // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                            // Apple requires its CPU implementation to do
+                            // correctly rounded address arithmetic in all modes
+                            || !(gDeviceType & CL_DEVICE_TYPE_GPU)
-                            )
+                        )
                             offset = 0.0f;          // Loop only once
                         for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) {
@@ -942,10 +963,14 @@
                                     if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                                     if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                                     if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                                    float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                    float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                    float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                    float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                    float maxErr1 = std::max(
+                                        maxErr * maxPixel.p[0], FLT_MIN);
+                                    float maxErr2 = std::max(
+                                        maxErr * maxPixel.p[1], FLT_MIN);
+                                    float maxErr3 = std::max(
+                                        maxErr * maxPixel.p[2], FLT_MIN);
+                                    float maxErr4 = std::max(
+                                        maxErr * maxPixel.p[3], FLT_MIN);
                                     if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
@@ -1004,10 +1029,14 @@
                                         float err4 = ABS_ERROR(resultPtr[3],
-                                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                        float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                        float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                        float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                        float maxErr1 = std::max(
+                                            maxErr * maxPixel.p[0], FLT_MIN);
+                                        float maxErr2 = std::max(
+                                            maxErr * maxPixel.p[1], FLT_MIN);
+                                        float maxErr3 = std::max(
+                                            maxErr * maxPixel.p[2], FLT_MIN);
+                                        float maxErr4 = std::max(
+                                            maxErr * maxPixel.p[3], FLT_MIN);
                                         if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
@@ -1096,7 +1125,10 @@
                                     // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                     // E.g., test one pixel.
-                                    if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                    if (!imageSampler->normalized_coords
+                                        || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                        || NORM_OFFSET == 0)
+                                    {
                                         norm_offset_x = 0.0f;
                                         norm_offset_y = 0.0f;
                                         norm_offset_z = 0.0f;
@@ -1135,7 +1167,11 @@
                                         // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                         // E.g., test one pixel.
-                                        if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                        if (!imageSampler->normalized_coords
+                                            || !(gDeviceType
+                                                 & CL_DEVICE_TYPE_GPU)
+                                            || NORM_OFFSET == 0)
+                                        {
                                             norm_offset_x = 0.0f;
                                             norm_offset_y = 0.0f;
                                             norm_offset_z = 0.0f;
@@ -1204,7 +1240,10 @@
                                     // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                     // E.g., test one pixel.
-                                    if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
+                                    if (!imageSampler->normalized_coords
+                                        || !(gDeviceType & CL_DEVICE_TYPE_GPU)
+                                        || NORM_OFFSET == 0)
+                                    {
                                         norm_offset_x = 0.0f;
                                         norm_offset_y = 0.0f;
                                         norm_offset_z = 0.0f;
@@ -1243,7 +1282,11 @@
                                         // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
                                         // E.g., test one pixel.
-                                        if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0 || NORM_OFFSET == 0 || NORM_OFFSET == 0) {
+                                        if (!imageSampler->normalized_coords
+                                            || !(gDeviceType
+                                                 & CL_DEVICE_TYPE_GPU)
+                                            || NORM_OFFSET == 0)
+                                        {
                                             norm_offset_x = 0.0f;
                                             norm_offset_y = 0.0f;
                                             norm_offset_z = 0.0f;
@@ -1377,17 +1420,16 @@
     // Construct the source
-    sprintf( programSrc,
-            KernelSourcePattern,
-            imageType,
-            samplerArg, get_explicit_type_name( outputType ),
-            imageElement,
-            gTestMipmaps ? ", float lod" : " ",
-            samplerVar,
+    sprintf(programSrc, KernelSourcePattern,
+            gTestMipmaps
+                ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable"
+                : "",
+            imageType, samplerArg, get_explicit_type_name(outputType),
+            imageElement, gTestMipmaps ? ", float lod" : " ", samplerVar,
             gTestMipmaps ? offset2DarraySourceLod : offset2DarraySource,
-            floatCoords ? float2DArrayUnnormalizedCoordKernelSource : int2DArrayCoordKernelSource,
-            readFormat,
-            gTestMipmaps ? ", lod" : " " );
+            floatCoords ? float2DArrayUnnormalizedCoordKernelSource
+                        : int2DArrayCoordKernelSource,
+            readFormat, gTestMipmaps ? ", lod" : " ");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_read_3D.cpp b/test_conformance/images/kernel_read_write/test_read_3D.cpp
index 860114f..cec77bf 100644
--- a/test_conformance/images/kernel_read_write/test_read_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_3D.cpp
@@ -36,24 +36,32 @@
 const char *read3DKernelSourcePattern =
-"__kernel void sample_kernel( read_only image3d_t input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets,  __global %s4 *results %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_only image3d_t input,%s __global float "
+    "*xOffsets, __global float *yOffsets, __global float *zOffsets,  __global "
+    "%s4 *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, imageSampler, coords %s);\n"
+    "}";
 const char *read_write3DKernelSourcePattern =
-"__kernel void sample_kernel( read_write image3d_t input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets,  __global %s4 *results %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   results[offset] = read_image%s( input, coords %s);\n"
+    "%s\n"
+    "__kernel void sample_kernel( read_write image3d_t input,%s __global float "
+    "*xOffsets, __global float *yOffsets, __global float *zOffsets,  __global "
+    "%s4 *results %s)\n"
+    "{\n"
+    "%s"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "%s"
+    "   results[offset] = read_image%s( input, coords %s);\n"
+    "}";
 const char *offset3DKernelSource =
 "   int offset = tidZ*get_image_width(input)*get_image_height(input) + tidY*get_image_width(input) + tidX;\n";
@@ -137,15 +145,16 @@
         KernelSourcePattern = read_write3DKernelSourcePattern;
-    sprintf( programSrc,
-            KernelSourcePattern,
-            samplerArg, get_explicit_type_name( outputType ),
-            gTestMipmaps? ", float lod": " ",
-            samplerVar,
-            gTestMipmaps? offset3DLodKernelSource: offset3DKernelSource,
-            floatCoords ? float3DUnnormalizedCoordKernelSource : int3DCoordKernelSource,
-            readFormat,
-            gTestMipmaps? ",lod":" ");
+    sprintf(programSrc, KernelSourcePattern,
+            gTestMipmaps
+                ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable"
+                : "",
+            samplerArg, get_explicit_type_name(outputType),
+            gTestMipmaps ? ", float lod" : " ", samplerVar,
+            gTestMipmaps ? offset3DLodKernelSource : offset3DKernelSource,
+            floatCoords ? float3DUnnormalizedCoordKernelSource
+                        : int3DCoordKernelSource,
+            readFormat, gTestMipmaps ? ",lod" : " ");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_write_1D.cpp b/test_conformance/images/kernel_read_write/test_write_1D.cpp
index 41983ed..5f72679 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "../testBase.h"
+#include "test_common.h"
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -26,20 +27,24 @@
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor* imageInfo );
 const char *readwrite1DKernelSourcePattern =
-"__kernel void sample_kernel( __global %s4 *input, read_write image1d_t output %s)\n"
-"   int tidX = get_global_id(0);\n"
-"   int offset = tidX;\n"
-"   write_image%s( output, tidX %s, input[ offset ]);\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s4 *input, read_write image1d_t "
+    "output %s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0);\n"
+    "   int offset = tidX;\n"
+    "   write_image%s( output, tidX %s, input[ offset ]);\n"
+    "}";
 const char *write1DKernelSourcePattern =
-"__kernel void sample_kernel( __global %s4 *input, write_only image1d_t output %s)\n"
-"   int tidX = get_global_id(0);\n"
-"   int offset = tidX;\n"
-"   write_image%s( output, tidX %s, input[ offset ]);\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s4 *input, write_only image1d_t "
+    "output %s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0);\n"
+    "   int offset = tidX;\n"
+    "   write_image%s( output, tidX %s, input[ offset ]);\n"
+    "}";
 int test_write_image_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_kernel kernel,
                      image_descriptor *imageInfo, ExplicitType inputType, MTdata d )
@@ -395,6 +400,8 @@
+                        filter_undefined_bits(imageInfo, resultPtr);
                         // Exact result passes every time
                         if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
@@ -403,21 +410,8 @@
                             float errors[4] = {NAN, NAN, NAN, NAN};
                             pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
-                            // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                            if( 0 == forceCorrectlyRoundedWrites    &&
-                               (
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                ))
-                            {
-                                if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                   ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                    failure = 0;
-                            }
+                            failure = filter_rounding_errors(
+                                forceCorrectlyRoundedWrites, imageInfo, errors);
                             if( failure )
@@ -458,6 +452,56 @@
                                         log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                         log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
+                                    case CL_UNORM_SHORT_565: { // 5/6/5-bit fields: three values per pixel
+                                        cl_uint *ref_value =
+                                            (cl_uint *)resultBuffer;
+                                        cl_uint *test_value =
+                                            (cl_uint *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x3F,
+                                                  (ref_value[0] >> 11) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x3F,
+                                                  (test_value[0] >> 11) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
+                                    case CL_UNORM_SHORT_555: { // three 5-bit fields: three values per pixel
+                                        cl_uint *ref_value =
+                                            (cl_uint *)resultBuffer;
+                                        cl_uint *test_value =
+                                            (cl_uint *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x1F,
+                                                  (ref_value[0] >> 10) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x1F,
+                                                  (test_value[0] >> 10) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
                                     case CL_UNORM_INT16:
                                     case CL_SNORM_INT16:
                                     case CL_UNSIGNED_INT16:
@@ -574,12 +618,14 @@
         KernelSourcePattern = readwrite1DKernelSourcePattern;
-    sprintf( programSrc,
-             KernelSourcePattern,
-             get_explicit_type_name( inputType ),
-             gTestMipmaps ? ", int lod" : "",
-             readFormat,
-             gTestMipmaps ? ", lod" :"" );
+    sprintf(
+        programSrc, KernelSourcePattern,
+        gTestMipmaps
+            ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma "
+              "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable"
+            : "",
+        get_explicit_type_name(inputType), gTestMipmaps ? ", int lod" : "",
+        readFormat, gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
index c771704..f902440 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "../testBase.h"
+#include "test_common.h"
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -26,20 +27,24 @@
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo );
 const char *readwrite1DArrayKernelSourcePattern =
-"__kernel void sample_kernel( __global %s4 *input, read_write image1d_array_t output %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ]);\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s4 *input, read_write "
+    "image1d_array_t output %s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "   write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ]);\n"
+    "}";
 const char *write1DArrayKernelSourcePattern =
-"__kernel void sample_kernel( __global %s4 *input, write_only image1d_array_t output %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s4 *input, write_only "
+    "image1d_array_t output %s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "   write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n"
+    "}";
 const char *offset1DArraySource =
 "   int offset = tidY*get_image_width(output) + tidX;\n";
@@ -415,6 +420,9 @@
+                        filter_undefined_bits(imageInfo, resultPtr);
                         // Exact result passes every time
                         if( memcmp( resultBuffer, resultPtr, pixelSize ) != 0 )
@@ -423,21 +431,8 @@
                             float errors[4] = {NAN, NAN, NAN, NAN};
                             pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
-                            // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                            if( 0 == forceCorrectlyRoundedWrites    &&
-                               (
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                ))
-                            {
-                                if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                   ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                    failure = 0;
-                            }
+                            failure = filter_rounding_errors(
+                                forceCorrectlyRoundedWrites, imageInfo, errors);
                             if( failure )
@@ -478,6 +473,56 @@
                                         log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                         log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
+                                    case CL_UNORM_SHORT_565: { // 5/6/5-bit fields: three values per pixel
+                                        cl_uint *ref_value =
+                                            (cl_uint *)resultBuffer;
+                                        cl_uint *test_value =
+                                            (cl_uint *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x3F,
+                                                  (ref_value[0] >> 11) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x3F,
+                                                  (test_value[0] >> 11) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
+                                    case CL_UNORM_SHORT_555: { // three 5-bit fields: three values per pixel
+                                        cl_uint *ref_value =
+                                            (cl_uint *)resultBuffer;
+                                        cl_uint *test_value =
+                                            (cl_uint *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x1F,
+                                                  (ref_value[0] >> 10) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x1F,
+                                                  (test_value[0] >> 10) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
                                     case CL_UNORM_INT16:
                                     case CL_SNORM_INT16:
                                     case CL_UNSIGNED_INT16:
@@ -596,13 +641,15 @@
     // Construct the source
     // Construct the source
-    sprintf( programSrc,
-             KernelSourcePattern,
-             get_explicit_type_name( inputType ),
-             gTestMipmaps ? ", int lod" : "",
-             gTestMipmaps ? offset1DArrayLodSource : offset1DArraySource,
-             readFormat,
-             gTestMipmaps ? ", lod" :"" );
+    sprintf(
+        programSrc, KernelSourcePattern,
+        gTestMipmaps
+            ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma "
+              "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable"
+            : "",
+        get_explicit_type_name(inputType), gTestMipmaps ? ", int lod" : "",
+        gTestMipmaps ? offset1DArrayLodSource : offset1DArraySource, readFormat,
+        gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
index 08a7a80..c1c5699 100644
--- a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "../testBase.h"
+#include "test_common.h"
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -48,20 +49,28 @@
 const char *write2DArrayKernelSourcePattern =
-"__kernel void sample_kernel( __global %s%s *input, write_only %s output %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ]);\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s%s *input, write_only %s output "
+    "%s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset "
+    "]);\n"
+    "}";
 const char *readwrite2DArrayKernelSourcePattern =
-"__kernel void sample_kernel( __global %s%s *input, read_write %s output %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ] );\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s%s *input, read_write %s output "
+    "%s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset "
+    "] );\n"
+    "}";
 const char *offset2DArrayKernelSource =
 "   int offset = tidZ*get_image_width(output)*get_image_height(output) + tidY*get_image_width(output) + tidX;\n";
@@ -438,6 +447,9 @@
+                            filter_undefined_bits(imageInfo, resultPtr);
                             // Exact result passes every time
                             if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
@@ -446,21 +458,9 @@
                                 float errors[4] = {NAN, NAN, NAN, NAN};
                                 pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
-                                // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                                if( 0 == forceCorrectlyRoundedWrites    &&
-                                   (
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                    ))
-                                {
-                                    if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                       ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                        failure = 0;
-                                }
+                                failure = filter_rounding_errors(
+                                    forceCorrectlyRoundedWrites, imageInfo,
+                                    errors);
                                 if( failure )
@@ -501,6 +501,64 @@
                                             log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                             log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
+                                        case CL_UNORM_SHORT_565: { // 5/6/5-bit fields: three values per pixel
+                                            cl_uint *ref_value =
+                                                (cl_uint *)resultBuffer;
+                                            cl_uint *test_value =
+                                                (cl_uint *)resultPtr;
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x3F,
+                                                (ref_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x3F,
+                                                (test_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
+                                        case CL_UNORM_SHORT_555: { // three 5-bit fields: three values per pixel
+                                            cl_uint *ref_value =
+                                                (cl_uint *)resultBuffer;
+                                            cl_uint *test_value =
+                                                (cl_uint *)resultPtr;
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x1F,
+                                                (ref_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x1F,
+                                                (test_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
                                         case CL_UNORM_INT16:
                                         case CL_SNORM_INT16:
                                         case CL_UNSIGNED_INT16:
@@ -621,15 +679,19 @@
     // Construct the source
     // Construct the source
-    sprintf( programSrc,
-             KernelSourcePattern,
-             get_explicit_type_name( inputType ),
-             (format->image_channel_order == CL_DEPTH) ? "" : "4",
-             (format->image_channel_order == CL_DEPTH) ? "image2d_array_depth_t" : "image2d_array_t",
-             gTestMipmaps ? " , int lod" : "",
-             gTestMipmaps ? offset2DArrayLodKernelSource : offset2DArrayKernelSource,
-             readFormat,
-             gTestMipmaps ? ", lod" : "" );
+    sprintf(
+        programSrc, KernelSourcePattern,
+        gTestMipmaps
+            ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma "
+              "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable"
+            : "",
+        get_explicit_type_name(inputType),
+        (format->image_channel_order == CL_DEPTH) ? "" : "4",
+        (format->image_channel_order == CL_DEPTH) ? "image2d_array_depth_t"
+                                                  : "image2d_array_t",
+        gTestMipmaps ? " , int lod" : "",
+        gTestMipmaps ? offset2DArrayLodKernelSource : offset2DArrayKernelSource,
+        readFormat, gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_write_3D.cpp b/test_conformance/images/kernel_read_write/test_write_3D.cpp
index 5cc96bb..9da9369 100644
--- a/test_conformance/images/kernel_read_write/test_write_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_3D.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "../testBase.h"
+#include "test_common.h"
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -45,22 +46,30 @@
 const char *write3DKernelSourcePattern =
-"__kernel void sample_kernel( __global %s4 *input, write_only image3d_t output %s )\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ]);\n"
+    "%s"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s4 *input, write_only image3d_t "
+    "output %s )\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset "
+    "]);\n"
+    "}";
 const char *readwrite3DKernelSourcePattern =
-"__kernel void sample_kernel( __global %s4 *input, read_write image3d_t output %s )\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n"
-"   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ]);\n"
+    "%s"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s4 *input, read_write image3d_t "
+    "output %s )\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = "
+    "get_global_id(2);\n"
+    "%s"
+    "   write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset "
+    "]);\n"
+    "}";
 const char *khr3DWritesPragma =
 "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
@@ -445,6 +454,9 @@
+                            filter_undefined_bits(imageInfo, resultPtr);
                             // Exact result passes every time
                             if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
@@ -453,21 +465,9 @@
                                 float errors[4] = {NAN, NAN, NAN, NAN};
                                 pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
-                                // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                                if( 0 == forceCorrectlyRoundedWrites    &&
-                                   (
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                    ))
-                                {
-                                    if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                       ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                        failure = 0;
-                                }
+                                failure = filter_rounding_errors(
+                                    forceCorrectlyRoundedWrites, imageInfo,
+                                    errors);
                                 if( failure )
@@ -508,6 +508,64 @@
                                             log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                             log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
+                                        case CL_UNORM_SHORT_565: {
+                                            // The pixel is 16 bits wide: view
+                                            // it as cl_ushort so no bytes of
+                                            // a neighbouring pixel are read.
+                                            cl_ushort *ref_value =
+                                                (cl_ushort *)resultBuffer;
+                                            cl_ushort *test_value =
+                                                (cl_ushort *)resultPtr;
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x3F,
+                                                (ref_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x3F,
+                                                (test_value[0] >> 11) & 0x1F);
+                                            // Three channels are unpacked
+                                            // above, so exactly three error
+                                            // values are printed.
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
+                                        case CL_UNORM_SHORT_555: {
+                                            // The pixel is 16 bits wide: view
+                                            // it as cl_ushort so no bytes of
+                                            // a neighbouring pixel are read.
+                                            cl_ushort *ref_value =
+                                                (cl_ushort *)resultBuffer;
+                                            cl_ushort *test_value =
+                                                (cl_ushort *)resultPtr;
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x1F,
+                                                (ref_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x1F,
+                                                (test_value[0] >> 10) & 0x1F);
+                                            // Three channels are unpacked
+                                            // above, so exactly three error
+                                            // values are printed.
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
                                         case CL_UNORM_INT16:
                                         case CL_SNORM_INT16:
                                         case CL_UNSIGNED_INT16:
@@ -628,14 +686,15 @@
     // Construct the source
-    sprintf( programSrc,
-             KernelSourcePattern,
-             gTestMipmaps ? "" : khr3DWritesPragma,
-             get_explicit_type_name( inputType ),
-             gTestMipmaps ? ", int lod" : "",
-             gTestMipmaps ? offset3DLodSource : offset3DSource,
-             readFormat,
-             gTestMipmaps ? ", lod" : "" );
+    sprintf(
+        programSrc, KernelSourcePattern, khr3DWritesPragma,
+        gTestMipmaps
+            ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma "
+              "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable"
+            : "",
+        get_explicit_type_name(inputType), gTestMipmaps ? ", int lod" : "",
+        gTestMipmaps ? offset3DLodSource : offset3DSource, readFormat,
+        gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/kernel_read_write/test_write_image.cpp b/test_conformance/images/kernel_read_write/test_write_image.cpp
index e40e80d..2962697 100644
--- a/test_conformance/images/kernel_read_write/test_write_image.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_image.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "../testBase.h"
+#include "test_common.h"
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -46,20 +47,24 @@
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo );
 const char *writeKernelSourcePattern =
-"__kernel void sample_kernel( __global %s%s *input, write_only %s output %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s%s *input, write_only %s output "
+    "%s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "   write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n"
+    "}";
 const char *read_writeKernelSourcePattern =
-"__kernel void sample_kernel( __global %s%s *input, read_write %s output %s)\n"
-"   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
-"   write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ] );\n"
+    "%s\n"
+    "__kernel void sample_kernel( __global %s%s *input, read_write %s output "
+    "%s)\n"
+    "{\n"
+    "   int tidX = get_global_id(0), tidY = get_global_id(1);\n"
+    "%s"
+    "   write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ] );\n"
+    "}";
 const char *offset2DKernelSource =
 "   int offset = tidY*get_image_width(output) + tidX;\n";
@@ -477,6 +482,9 @@
+                        filter_undefined_bits(imageInfo, resultPtr);
                         // Exact result passes every time
                         if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
@@ -485,21 +493,8 @@
                             float errors[4] = {NAN, NAN, NAN, NAN};
                             pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
-                            // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                            if( 0 == forceCorrectlyRoundedWrites    &&
-                               (
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                ))
-                            {
-                                if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                   ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                    failure = 0;
-                            }
+                            failure = filter_rounding_errors(
+                                forceCorrectlyRoundedWrites, imageInfo, errors);
                             if( failure )
@@ -577,6 +572,57 @@
                                         log_error( "    Actual:   %a %a %a %a\n", ((cl_float*)resultPtr)[0], ((cl_float*)resultPtr)[1], ((cl_float*)resultPtr)[2], ((cl_float*)resultPtr)[3] );
                                         log_error( "    Ulps:     %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
+                                    case CL_UNORM_SHORT_565: {
+                                        // The pixel is 16 bits wide: view it
+                                        // as cl_ushort so no bytes of a
+                                        // neighbouring pixel are read.
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x3F,
+                                                  (ref_value[0] >> 11) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x3F,
+                                                  (test_value[0] >> 11) & 0x1F);
+                                        // Three channels are unpacked above,
+                                        // so exactly three error values are
+                                        // printed.
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
+                                    case CL_UNORM_SHORT_555: {
+                                        // The pixel is 16 bits wide: view it
+                                        // as cl_ushort so no bytes of a
+                                        // neighbouring pixel are read.
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x1F,
+                                                  (ref_value[0] >> 10) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x1F,
+                                                  (test_value[0] >> 10) & 0x1F);
+                                        // Three channels are unpacked above,
+                                        // so exactly three error values are
+                                        // printed.
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
                                 float *v = (float *)(char *)imagePtr;
@@ -686,15 +732,19 @@
     // Construct the source
-    sprintf( programSrc,
-             KernelSourcePattern,
-             get_explicit_type_name( inputType ),
-             (format->image_channel_order == CL_DEPTH) ? "" : "4",
-             (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t" : "image2d_t",
-             gTestMipmaps ? ", int lod" : "",
-             gTestMipmaps ? offset2DLodKernelSource : offset2DKernelSource,
-             readFormat,
-             gTestMipmaps ? ", lod" : "" );
+    sprintf(
+        programSrc, KernelSourcePattern,
+        gTestMipmaps
+            ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma "
+              "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable"
+            : "",
+        get_explicit_type_name(inputType),
+        (format->image_channel_order == CL_DEPTH) ? "" : "4",
+        (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t"
+                                                  : "image2d_t",
+        gTestMipmaps ? ", int lod" : "",
+        gTestMipmaps ? offset2DLodKernelSource : offset2DKernelSource,
+        readFormat, gTestMipmaps ? ", lod" : "");
     ptr = programSrc;
     error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
diff --git a/test_conformance/images/samplerlessReads/test_iterations.cpp b/test_conformance/images/samplerlessReads/test_iterations.cpp
index 55eaaf4..e2f89aa 100644
--- a/test_conformance/images/samplerlessReads/test_iterations.cpp
+++ b/test_conformance/images/samplerlessReads/test_iterations.cpp
@@ -215,6 +215,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     // Determine types
diff --git a/test_conformance/images/samplerlessReads/test_read_1D.cpp b/test_conformance/images/samplerlessReads/test_read_1D.cpp
index aa261b7..6ed9910 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D.cpp
@@ -215,6 +215,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     // Determine types
diff --git a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
index fb0c263..677eb9f 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
@@ -214,6 +214,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     // Determine types
diff --git a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
index 7a3084d..c3a991a 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
@@ -219,6 +219,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     // note: image_buffer test uses image1D for results validation.
diff --git a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
index 99f2426..8273f53 100644
--- a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
@@ -202,6 +202,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     // Determine types
diff --git a/test_conformance/images/samplerlessReads/test_read_3D.cpp b/test_conformance/images/samplerlessReads/test_read_3D.cpp
index cf41140..0df46c8 100644
--- a/test_conformance/images/samplerlessReads/test_read_3D.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_3D.cpp
@@ -206,6 +206,7 @@
     if (memSize > (cl_ulong)SIZE_MAX) {
       memSize = (cl_ulong)SIZE_MAX;
+      maxAllocSize = (cl_ulong)SIZE_MAX;
     // Determine types
diff --git a/test_conformance/integer_ops/CMakeLists.txt b/test_conformance/integer_ops/CMakeLists.txt
index a045ef8..5344eab 100644
--- a/test_conformance/integer_ops/CMakeLists.txt
+++ b/test_conformance/integer_ops/CMakeLists.txt
@@ -11,6 +11,7 @@
+    test_integer_dot_product.cpp
diff --git a/test_conformance/integer_ops/main.cpp b/test_conformance/integer_ops/main.cpp
index 00e9166..e57cffd 100644
--- a/test_conformance/integer_ops/main.cpp
+++ b/test_conformance/integer_ops/main.cpp
@@ -25,127 +25,129 @@
 test_definition test_list[] = {
-    ADD_TEST( integer_clz ),
-    ADD_TEST_VERSION( integer_ctz,  Version(2, 0)),
-    ADD_TEST( integer_hadd ),
-    ADD_TEST( integer_rhadd ),
-    ADD_TEST( integer_mul_hi ),
-    ADD_TEST( integer_rotate ),
-    ADD_TEST( integer_clamp ),
-    ADD_TEST( integer_mad_sat ),
-    ADD_TEST( integer_mad_hi ),
-    ADD_TEST( integer_min ),
-    ADD_TEST( integer_max ),
-    ADD_TEST( integer_upsample ),
+    ADD_TEST(integer_clz),
+    ADD_TEST_VERSION(integer_ctz, Version(2, 0)),
+    ADD_TEST(integer_hadd),
+    ADD_TEST(integer_rhadd),
+    ADD_TEST(integer_mul_hi),
+    ADD_TEST(integer_rotate),
+    ADD_TEST(integer_clamp),
+    ADD_TEST(integer_mad_sat),
+    ADD_TEST(integer_mad_hi),
+    ADD_TEST(integer_min),
+    ADD_TEST(integer_max),
+    ADD_TEST(integer_upsample),
-    ADD_TEST( integer_abs ),
-    ADD_TEST( integer_abs_diff ),
-    ADD_TEST( integer_add_sat ),
-    ADD_TEST( integer_sub_sat ),
+    ADD_TEST(integer_abs),
+    ADD_TEST(integer_abs_diff),
+    ADD_TEST(integer_add_sat),
+    ADD_TEST(integer_sub_sat),
-    ADD_TEST( integer_addAssign ),
-    ADD_TEST( integer_subtractAssign ),
-    ADD_TEST( integer_multiplyAssign ),
-    ADD_TEST( integer_divideAssign ),
-    ADD_TEST( integer_moduloAssign ),
-    ADD_TEST( integer_andAssign ),
-    ADD_TEST( integer_orAssign ),
-    ADD_TEST( integer_exclusiveOrAssign ),
+    ADD_TEST(integer_addAssign),
+    ADD_TEST(integer_subtractAssign),
+    ADD_TEST(integer_multiplyAssign),
+    ADD_TEST(integer_divideAssign),
+    ADD_TEST(integer_moduloAssign),
+    ADD_TEST(integer_andAssign),
+    ADD_TEST(integer_orAssign),
+    ADD_TEST(integer_exclusiveOrAssign),
-    ADD_TEST( unary_ops_increment ),
-    ADD_TEST( unary_ops_decrement ),
-    ADD_TEST( unary_ops_full ),
+    ADD_TEST(unary_ops_increment),
+    ADD_TEST(unary_ops_decrement),
+    ADD_TEST(unary_ops_full),
-    ADD_TEST( integer_mul24 ),
-    ADD_TEST( integer_mad24 ),
+    ADD_TEST(integer_mul24),
+    ADD_TEST(integer_mad24),
-    ADD_TEST( long_math ),
-    ADD_TEST( long_logic ),
-    ADD_TEST( long_shift ),
-    ADD_TEST( long_compare ),
+    ADD_TEST(long_math),
+    ADD_TEST(long_logic),
+    ADD_TEST(long_shift),
+    ADD_TEST(long_compare),
-    ADD_TEST( ulong_math ),
-    ADD_TEST( ulong_logic ),
-    ADD_TEST( ulong_shift ),
-    ADD_TEST( ulong_compare ),
+    ADD_TEST(ulong_math),
+    ADD_TEST(ulong_logic),
+    ADD_TEST(ulong_shift),
+    ADD_TEST(ulong_compare),
-    ADD_TEST( int_math ),
-    ADD_TEST( int_logic ),
-    ADD_TEST( int_shift ),
-    ADD_TEST( int_compare ),
+    ADD_TEST(int_math),
+    ADD_TEST(int_logic),
+    ADD_TEST(int_shift),
+    ADD_TEST(int_compare),
-    ADD_TEST( uint_math ),
-    ADD_TEST( uint_logic ),
-    ADD_TEST( uint_shift ),
-    ADD_TEST( uint_compare ),
+    ADD_TEST(uint_math),
+    ADD_TEST(uint_logic),
+    ADD_TEST(uint_shift),
+    ADD_TEST(uint_compare),
-    ADD_TEST( short_math ),
-    ADD_TEST( short_logic ),
-    ADD_TEST( short_shift ),
-    ADD_TEST( short_compare ),
+    ADD_TEST(short_math),
+    ADD_TEST(short_logic),
+    ADD_TEST(short_shift),
+    ADD_TEST(short_compare),
-    ADD_TEST( ushort_math ),
-    ADD_TEST( ushort_logic ),
-    ADD_TEST( ushort_shift ),
-    ADD_TEST( ushort_compare ),
+    ADD_TEST(ushort_math),
+    ADD_TEST(ushort_logic),
+    ADD_TEST(ushort_shift),
+    ADD_TEST(ushort_compare),
-    ADD_TEST( char_math ),
-    ADD_TEST( char_logic ),
-    ADD_TEST( char_shift ),
-    ADD_TEST( char_compare ),
+    ADD_TEST(char_math),
+    ADD_TEST(char_logic),
+    ADD_TEST(char_shift),
+    ADD_TEST(char_compare),
-    ADD_TEST( uchar_math ),
-    ADD_TEST( uchar_logic ),
-    ADD_TEST( uchar_shift ),
-    ADD_TEST( uchar_compare ),
+    ADD_TEST(uchar_math),
+    ADD_TEST(uchar_logic),
+    ADD_TEST(uchar_shift),
+    ADD_TEST(uchar_compare),
-    ADD_TEST( popcount ),
+    ADD_TEST(popcount),
     // Quick
-    ADD_TEST( quick_long_math ),
-    ADD_TEST( quick_long_logic ),
-    ADD_TEST( quick_long_shift ),
-    ADD_TEST( quick_long_compare ),
+    ADD_TEST(quick_long_math),
+    ADD_TEST(quick_long_logic),
+    ADD_TEST(quick_long_shift),
+    ADD_TEST(quick_long_compare),
-    ADD_TEST( quick_ulong_math ),
-    ADD_TEST( quick_ulong_logic ),
-    ADD_TEST( quick_ulong_shift ),
-    ADD_TEST( quick_ulong_compare ),
+    ADD_TEST(quick_ulong_math),
+    ADD_TEST(quick_ulong_logic),
+    ADD_TEST(quick_ulong_shift),
+    ADD_TEST(quick_ulong_compare),
-    ADD_TEST( quick_int_math ),
-    ADD_TEST( quick_int_logic ),
-    ADD_TEST( quick_int_shift ),
-    ADD_TEST( quick_int_compare ),
+    ADD_TEST(quick_int_math),
+    ADD_TEST(quick_int_logic),
+    ADD_TEST(quick_int_shift),
+    ADD_TEST(quick_int_compare),
-    ADD_TEST( quick_uint_math ),
-    ADD_TEST( quick_uint_logic ),
-    ADD_TEST( quick_uint_shift ),
-    ADD_TEST( quick_uint_compare ),
+    ADD_TEST(quick_uint_math),
+    ADD_TEST(quick_uint_logic),
+    ADD_TEST(quick_uint_shift),
+    ADD_TEST(quick_uint_compare),
-    ADD_TEST( quick_short_math ),
-    ADD_TEST( quick_short_logic ),
-    ADD_TEST( quick_short_shift ),
-    ADD_TEST( quick_short_compare ),
+    ADD_TEST(quick_short_math),
+    ADD_TEST(quick_short_logic),
+    ADD_TEST(quick_short_shift),
+    ADD_TEST(quick_short_compare),
-    ADD_TEST( quick_ushort_math ),
-    ADD_TEST( quick_ushort_logic ),
-    ADD_TEST( quick_ushort_shift ),
-    ADD_TEST( quick_ushort_compare ),
+    ADD_TEST(quick_ushort_math),
+    ADD_TEST(quick_ushort_logic),
+    ADD_TEST(quick_ushort_shift),
+    ADD_TEST(quick_ushort_compare),
-    ADD_TEST( quick_char_math ),
-    ADD_TEST( quick_char_logic ),
-    ADD_TEST( quick_char_shift ),
-    ADD_TEST( quick_char_compare ),
+    ADD_TEST(quick_char_math),
+    ADD_TEST(quick_char_logic),
+    ADD_TEST(quick_char_shift),
+    ADD_TEST(quick_char_compare),
-    ADD_TEST( quick_uchar_math ),
-    ADD_TEST( quick_uchar_logic ),
-    ADD_TEST( quick_uchar_shift ),
-    ADD_TEST( quick_uchar_compare ),
+    ADD_TEST(quick_uchar_math),
+    ADD_TEST(quick_uchar_logic),
+    ADD_TEST(quick_uchar_shift),
+    ADD_TEST(quick_uchar_compare),
-    ADD_TEST( vector_scalar ),
+    ADD_TEST(vector_scalar),
+    ADD_TEST(integer_dot_product),
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 void fill_test_values( cl_long *outBufferA, cl_long *outBufferB, size_t numElements, MTdata d )
diff --git a/test_conformance/integer_ops/procs.h b/test_conformance/integer_ops/procs.h
index d5b77e7..82311fb 100644
--- a/test_conformance/integer_ops/procs.h
+++ b/test_conformance/integer_ops/procs.h
@@ -141,3 +141,5 @@
 extern int test_vector_scalar(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_integer_dot_product(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue, int num_elements);
diff --git a/test_conformance/integer_ops/test_add_sat.cpp b/test_conformance/integer_ops/test_add_sat.cpp
index c0e45d1..e33f5c6 100644
--- a/test_conformance/integer_ops/test_add_sat.cpp
+++ b/test_conformance/integer_ops/test_add_sat.cpp
@@ -21,27 +21,18 @@
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <algorithm>
 #include "procs.h"
-#define UCHAR_MIN   0
-#define USHRT_MIN   0
-#define UINT_MIN    0
-#ifndef MAX
-#define MAX( _a, _b )   ( (_a) > (_b) ? (_a) : (_b) )
-#ifndef MIN
-#define MIN( _a, _b )   ( (_a) < (_b) ? (_a) : (_b) )
 static int verify_addsat_char( const cl_char *inA, const cl_char *inB, const cl_char *outptr, int n, const char *sizeName, int vecSize )
     int i;
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] + (cl_int) inB[i];
-        r = MAX( r, CL_CHAR_MIN );
-        r = MIN( r, CL_CHAR_MAX );
+        r = std::max(r, CL_CHAR_MIN);
+        r = std::min(r, CL_CHAR_MAX);
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for add_sat( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -55,9 +46,9 @@
     for( i = 0; i < n; i++ )
         cl_int r = (int) inA[i] + (int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_UCHAR_MAX );
-        if( r != outptr[i] )
+        r = std::max(r, 0);
+        r = std::min(r, CL_UCHAR_MAX);
+        if (r != outptr[i])
         { log_info( "\n%d) Failure for add_sat( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
     return 0;
@@ -69,8 +60,8 @@
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] + (cl_int) inB[i];
-        r = MAX( r, CL_SHRT_MIN );
-        r = MIN( r, CL_SHRT_MAX );
+        r = std::max(r, CL_SHRT_MIN);
+        r = std::min(r, CL_SHRT_MAX);
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for add_sat( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -84,8 +75,8 @@
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] + (cl_int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_USHRT_MAX );
+        r = std::max(r, 0);
+        r = std::min(r, CL_USHRT_MAX);
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for add_sat( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
diff --git a/test_conformance/integer_ops/test_integer_dot_product.cpp b/test_conformance/integer_ops/test_integer_dot_product.cpp
new file mode 100644
index 0000000..602d59b
--- /dev/null
+++ b/test_conformance/integer_ops/test_integer_dot_product.cpp
@@ -0,0 +1,442 @@
+// Copyright (c) 2021 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <algorithm>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "procs.h"
+#include "harness/integer_ops_test_info.h"
+#include "harness/testHarness.h"
+// Computes the host-side reference values for dot() and dot_acc_sat().
+//
+// `a` and `b` are read as consecutive groups of N elements. For each group r
+// the element-wise products are summed in a 64-bit accumulator and stored in
+// ref[r] (ref is resized to a.size() / N).
+//
+// When AccSat is true, acc[r] is added to the sum and the total is saturated
+// to the full [min, max] range of DstType before it is narrowed. When AccSat
+// is false, acc is not read and the sum is narrowed without clamping.
+template <size_t N, typename DstType, typename SrcTypeA, typename SrcTypeB>
+static void
+calculate_reference(std::vector<DstType>& ref, const std::vector<SrcTypeA>& a,
+                    const std::vector<SrcTypeB>& b, const bool AccSat = false,
+                    const std::vector<DstType>& acc = {})
+{
+    assert(a.size() == b.size());
+    assert(AccSat == false || acc.size() == a.size() / N);
+    ref.resize(a.size() / N);
+    for (size_t r = 0; r < ref.size(); r++)
+    {
+        cl_long result = AccSat ? acc[r] : 0;
+        for (size_t c = 0; c < N; c++)
+        {
+            // Widen both operands first so the product is formed in 64 bits
+            // rather than in the promoted type of the source elements.
+            result += static_cast<cl_long>(a[r * N + c])
+                * static_cast<cl_long>(b[r * N + c]);
+        }
+        if (AccSat)
+        {
+            // Saturate at both ends: a signed accumulate can fall below the
+            // minimum of DstType as well as rise above its maximum.
+            const cl_long lowest = std::numeric_limits<DstType>::min();
+            const cl_long highest = std::numeric_limits<DstType>::max();
+            result = std::min(std::max(result, lowest), highest);
+        }
+        ref[r] = static_cast<DstType>(result);
+    }
+}
+// Fills the front of `a` and `b` with every pairing of seven boundary values
+// per source type (min, min + 1, min / 2, 0, max / 2, max - 1, max), writing
+// one pair per index, and then requests random data for the elements that
+// follow.
+// NOTE(review): nothing here checks that the vectors can hold all 7 * 7
+// special pairs. If a.size() or b.size() is smaller, the indexed writes go
+// past the end and `size() - count` wraps around - confirm callers always
+// pass large enough vectors.
+template <typename SrcTypeA, typename SrcTypeB>
+void generate_inputs_with_special_values(std::vector<SrcTypeA>& a,
+                                         std::vector<SrcTypeB>& b)
+    const std::vector<SrcTypeA> specialValuesA(
+        { static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min()),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() + 1),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() / 2), 0,
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() / 2),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() - 1),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max()) });
+    const std::vector<SrcTypeB> specialValuesB(
+        { static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min()),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() + 1),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() / 2), 0,
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() / 2),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() - 1),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max()) });
+    // Cross product of the two special-value lists; `count` ends up as the
+    // number of elements written so far.
+    size_t count = 0;
+    for (auto svA : specialValuesA)
+    {
+        for (auto svB : specialValuesB)
+        {
+            a[count] = svA;
+            b[count] = svB;
+            ++count;
+        }
+    }
+    // Generate random data for the rest of the inputs:
+    MTdataHolder d(gRandomSeed);
+    generate_random_data(TestInfo<SrcTypeA>::explicitType, a.size() - count, d,
+                + count);
+    generate_random_data(TestInfo<SrcTypeB>::explicitType, b.size() - count, d,
+                + count);
+// Produces the accumulator inputs used by the dot_acc_sat test cases: random
+// data, with every even-indexed element replaced by max() minus its random
+// value.
+// NOTE(review): for a signed SrcType, `max() - acc[i]` overflows whenever the
+// random acc[i] is negative - confirm the value range produced by
+// fill_vector_with_random_data().
+template <typename SrcType>
+void generate_acc_sat_inputs(std::vector<SrcType>& acc)
+    // First generate random data:
+    fill_vector_with_random_data(acc);
+    // Now go through the generated data, and make every other element large.
+    // This ensures we have some elements that need saturation.
+    for (size_t i = 0; i < acc.size(); i += 2)
+    {
+        acc[i] = std::numeric_limits<SrcType>::max() - acc[i];
+    }
+// Maps a host element type to the OpenCL C type name used for the source
+// operands when a test case runs in packed mode (see the -DSRCTYPEA /
+// -DSRCTYPEB build options in test_case_dot). The primary template is a
+// placeholder for element types without a specialization.
+template <typename T> struct PackedTestInfo
+    static constexpr const char* deviceTypeName = "UNSUPPORTED";
+// Signed 8-bit elements are passed to the kernel as "int".
+template <> struct PackedTestInfo<cl_char>
+    static constexpr const char* deviceTypeName = "int";
+// Unsigned 8-bit elements are passed to the kernel as "uint".
+template <> struct PackedTestInfo<cl_uchar>
+    static constexpr const char* deviceTypeName = "uint";
+// OpenCL C sources for the two test kernels (plain dot and accumulating,
+// saturating dot). Each work-item handles the element at get_global_id(0).
+// DSTTYPE, SRCTYPEA, SRCTYPEB, DOT and DOT_ACC_SAT are macros supplied through
+// the -D build options assembled in test_case_dot().
+static constexpr const char* kernel_source_dot = R"CLC(
+__kernel void test_dot(__global DSTTYPE* dst, __global SRCTYPEA* a, __global SRCTYPEB* b)
+    int index = get_global_id(0);
+    dst[index] = DOT(a[index], b[index]);
+static constexpr const char* kernel_source_dot_acc_sat = R"CLC(
+__kernel void test_dot_acc_sat(
+    __global DSTTYPE* dst,
+    __global SRCTYPEA* a, __global SRCTYPEB* b, __global DSTTYPE* acc)
+    int index = get_global_id(0);
+    dst[index] = DOT_ACC_SAT(a[index], b[index], acc[index]);
+// Builds and runs one dot-product kernel variant and compares the device
+// output with the host reference from calculate_reference().
+//
+// DstType, SrcTypeA and SrcTypeB are the result and operand element types and
+// N is the number of source elements consumed per output element.
+// `packed` selects the dot_4x8packed_* / dot_acc_sat_4x8packed_* built-ins and
+// the PackedTestInfo type names; `sat` selects the dot_acc_sat form, which
+// takes an additional accumulator buffer.
+// Returns TEST_PASS when the results equal the reference and TEST_FAIL when
+// they differ; OpenCL API errors are handed to the test_error() macro.
+template <typename DstType, typename SrcTypeA, typename SrcTypeB, size_t N>
+static int test_case_dot(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements, bool packed,
+                         bool sat)
+    log_info("    testing %s = dot%s%s(%s, %s)\n",
+             std::numeric_limits<DstType>::is_signed ? "signed" : "unsigned",
+             sat ? "_acc_sat" : "", packed ? "_packed" : "",
+             std::numeric_limits<SrcTypeA>::is_signed ? "signed" : "unsigned",
+             std::numeric_limits<SrcTypeB>::is_signed ? "signed" : "unsigned");
+    cl_int error = CL_SUCCESS;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    // Describe the element types to the kernel source through -D macros. In
+    // the non-packed case the operand type name gets the vector width N
+    // appended.
+    std::string buildOptions;
+    buildOptions += " -DDSTTYPE=";
+    buildOptions += TestInfo<DstType>::deviceTypeName;
+    buildOptions += " -DSRCTYPEA=";
+    buildOptions += packed
+        ? PackedTestInfo<SrcTypeA>::deviceTypeName
+        : TestInfo<SrcTypeA>::deviceTypeName + std::to_string(N);
+    buildOptions += " -DSRCTYPEB=";
+    buildOptions += packed
+        ? PackedTestInfo<SrcTypeB>::deviceTypeName
+        : TestInfo<SrcTypeB>::deviceTypeName + std::to_string(N);
+    // Suffix of the packed built-in name: signedness of operand A, then of
+    // operand B, then the result type, e.g. "us_int" for unsigned A, signed B
+    // and a signed result.
+    std::string packedSuffix;
+    packedSuffix += std::numeric_limits<SrcTypeA>::is_signed ? "s" : "u";
+    packedSuffix += std::numeric_limits<SrcTypeB>::is_signed ? "s" : "u";
+    packedSuffix += std::numeric_limits<DstType>::is_signed ? "_int" : "_uint";
+    if (sat)
+    {
+        buildOptions += packed
+            ? " -DDOT_ACC_SAT=dot_acc_sat_4x8packed_" + packedSuffix
+            : " -DDOT_ACC_SAT=dot_acc_sat";
+    }
+    else
+    {
+        buildOptions +=
+            packed ? " -DDOT=dot_4x8packed_" + packedSuffix : " -DDOT=dot";
+    }
+    // Host-side inputs: N source elements per output element.
+    std::vector<SrcTypeA> a(N * num_elements);
+    std::vector<SrcTypeB> b(N * num_elements);
+    generate_inputs_with_special_values(a, b);
+    // The accumulator stays empty unless the saturating variant is tested.
+    std::vector<DstType> acc;
+    if (sat)
+    {
+        acc.resize(num_elements);
+        generate_acc_sat_inputs(acc);
+    }
+    std::vector<DstType> reference(num_elements);
+    calculate_reference<N>(reference, a, b, sat, acc);
+    const char* source = sat ? kernel_source_dot_acc_sat : kernel_source_dot;
+    const char* name = sat ? "test_dot_acc_sat" : "test_dot";
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &source,
+                                        name, buildOptions.c_str());
+    test_error(error, "Unable to create test kernel");
+    clMemWrapper dst = clCreateBuffer(
+        context, 0, reference.size() * sizeof(DstType), NULL, &error);
+    test_error(error, "Unable to create output buffer");
+    clMemWrapper srcA =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       a.size() * sizeof(SrcTypeA),, &error);
+    test_error(error, "Unable to create srcA buffer");
+    clMemWrapper srcB =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       b.size() * sizeof(SrcTypeB),, &error);
+    test_error(error, "Unable to create srcB buffer");
+    clMemWrapper srcAcc;
+    if (sat)
+    {
+        srcAcc =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           acc.size() * sizeof(DstType),, &error);
+        test_error(error, "Unable to create acc buffer");
+    }
+    // Argument order matches the kernel signatures: dst, a, b and, for the
+    // saturating kernel only, acc.
+    error = clSetKernelArg(kernel, 0, sizeof(dst), &dst);
+    test_error(error, "Unable to set output buffer kernel arg");
+    error = clSetKernelArg(kernel, 1, sizeof(srcA), &srcA);
+    test_error(error, "Unable to set srcA buffer kernel arg");
+    error = clSetKernelArg(kernel, 2, sizeof(srcB), &srcB);
+    test_error(error, "Unable to set srcB buffer kernel arg");
+    if (sat)
+    {
+        error = clSetKernelArg(kernel, 3, sizeof(srcAcc), &srcAcc);
+        test_error(error, "Unable to set acc buffer kernel arg");
+    }
+    // One work-item per reference element.
+    size_t global_work_size[] = { reference.size() };
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
+                                   NULL, 0, NULL, NULL);
+    test_error(error, "Unable to enqueue test kernel");
+    error = clFinish(queue);
+    test_error(error, "clFinish failed after test kernel");
+    // 99 is only the initial fill value (presumably so that elements the
+    // kernel never wrote stand out in a comparison).
+    std::vector<DstType> results(reference.size(), 99);
+    error = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0,
+                                results.size() * sizeof(DstType),
+                      , 0, NULL, NULL);
+    test_error(error, "Unable to read data after test kernel");
+    if (results != reference)
+    {
+        log_error("Result buffer did not match reference buffer!\n");
+        return TEST_FAIL;
+    }
+    return TEST_PASS;
+// Runs the non-packed dot() and dot_acc_sat() test cases for the operand
+// signedness combinations derived from SrcType and DstType:
+// unsigned x unsigned -> unsigned, signed x signed -> signed,
+// unsigned x signed -> signed and signed x unsigned -> signed.
+// The individual results are OR-ed together and returned.
+template <typename SrcType, typename DstType, size_t N>
+static int test_vectype(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+    int result = TEST_PASS;
+    // Signed and unsigned flavours of the requested element types.
+    typedef typename std::make_signed<SrcType>::type SSrcType;
+    typedef typename std::make_signed<DstType>::type SDstType;
+    typedef typename std::make_unsigned<SrcType>::type USrcType;
+    typedef typename std::make_unsigned<DstType>::type UDstType;
+    // dot testing:
+    result |= test_case_dot<UDstType, USrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, false, false);
+    result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, false, false);
+    result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, false, false);
+    result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, false, false);
+    // dot_acc_sat testing:
+    result |= test_case_dot<UDstType, USrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, false, true);
+    result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, false, true);
+    result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, false, true);
+    result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, false, true);
+    return result;
+// Packed counterpart of test_vectype: runs the same four signedness
+// combinations for dot and dot_acc_sat, but with `packed` set to true so that
+// test_case_dot uses the *_4x8packed_* built-ins. The individual results are
+// OR-ed together and returned.
+template <typename SrcType, typename DstType, size_t N>
+static int test_vectype_packed(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
+    int result = TEST_PASS;
+    // Signed and unsigned flavours of the requested element types.
+    typedef typename std::make_signed<SrcType>::type SSrcType;
+    typedef typename std::make_signed<DstType>::type SDstType;
+    typedef typename std::make_unsigned<SrcType>::type USrcType;
+    typedef typename std::make_unsigned<DstType>::type UDstType;
+    // packed dot testing:
+    result |= test_case_dot<UDstType, USrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, true, false);
+    result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, true, false);
+    result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, true, false);
+    result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, true, false);
+    // packed dot_acc_sat testing:
+    result |= test_case_dot<UDstType, USrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, true, true);
+    result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, true, true);
+    result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
+        deviceID, context, queue, num_elements, true, true);
+    result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
+        deviceID, context, queue, num_elements, true, true);
+    return result;
+// Entry point of the cl_khr_integer_dot_product test.
+//
+// Returns TEST_SKIPPED_ITSELF when the device does not report the extension.
+// Otherwise it determines the extension version, queries the device's
+// dot-product capabilities (and, for version >= 2.0.0, the acceleration
+// properties), and then runs the built-in function tests through
+// test_vectype / test_vectype_packed, returning their OR-ed results.
+int test_integer_dot_product(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
+    if (!is_extension_available(deviceID, "cl_khr_integer_dot_product"))
+    {
+        log_info("cl_khr_integer_dot_product is not supported\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    // The extension version is only queried when the device is OpenCL 3.0 or
+    // newer, or reports cl_khr_extended_versioning.
+    Version deviceVersion = get_device_cl_version(deviceID);
+    cl_version extensionVersion;
+    if ((deviceVersion >= Version(3, 0))
+        || is_extension_available(deviceID, "cl_khr_extended_versioning"))
+    {
+        extensionVersion =
+            get_extension_version(deviceID, "cl_khr_integer_dot_product");
+    }
+    else
+    {
+        // Assume 1.0.0 is supported if the version can't be queried
+        extensionVersion = CL_MAKE_VERSION(1, 0, 0);
+    }
+    cl_int error = CL_SUCCESS;
+    int result = TEST_PASS;
+    cl_device_integer_dot_product_capabilities_khr dotCaps = 0;
+    error = clGetDeviceInfo(deviceID,
+                            sizeof(dotCaps), &dotCaps, NULL);
+    test_error(
+        error,
+    // Check that the required capabilities are reported
+    test_assert_error(
+        "When cl_khr_integer_dot_product is supported "
+        "supported");
+    if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0))
+    {
+        test_assert_error(
+            "When cl_khr_integer_dot_product is supported with version >= 2.0.0"
+            "supported");
+    }
+    // Check that acceleration properties can be queried
+    if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0))
+    {
+        size_t size_ret;
+        error = clGetDeviceInfo(
+            deviceID,
+            nullptr, &size_ret);
+        test_error(
+            error,
+            "Unable to query size of data returned by "
+        cl_device_integer_dot_product_acceleration_properties_khr
+            accelerationProperties;
+        error = clGetDeviceInfo(
+            deviceID,
+            sizeof(accelerationProperties), &accelerationProperties, nullptr);
+        test_error(error, "Unable to query 8-bit acceleration properties");
+        error = clGetDeviceInfo(
+            deviceID,
+            0, nullptr, &size_ret);
+        test_error(
+            error,
+            "Unable to query size of data returned by "
+            "PACKED_KHR");
+        error = clGetDeviceInfo(
+            deviceID,
+            sizeof(accelerationProperties), &accelerationProperties, nullptr);
+        test_error(error,
+                   "Unable to query 4x8-bit packed acceleration properties");
+    }
+    // Report when unknown capabilities are found
+    if (dotCaps
+    {
+        log_info("NOTE: found an unknown / untested capability!\n");
+    }
+    // Test built-in functions
+    {
+        result |= test_vectype<cl_uchar, cl_uint, 4>(deviceID, context, queue,
+                                                     num_elements);
+    }
+    {
+        result |= test_vectype_packed<cl_uchar, cl_uint, 4>(
+            deviceID, context, queue, num_elements);
+    }
+    return result;
diff --git a/test_conformance/integer_ops/test_integers.cpp b/test_conformance/integer_ops/test_integers.cpp
index 8d77b24..6fa18e1 100644
--- a/test_conformance/integer_ops/test_integers.cpp
+++ b/test_conformance/integer_ops/test_integers.cpp
@@ -16,14 +16,9 @@
 #include "testBase.h"
 #include "harness/conversions.h"
-#define TEST_SIZE 512
+#include <algorithm>
-#ifndef MIN
-    #define MIN( _a, _b )   ((_a) < (_b) ? (_a) : (_b))
-#ifndef MAX
-    #define MAX( _a, _b )   ((_a) > (_b) ? (_a) : (_b))
+#define TEST_SIZE 512
 const char *singleParamIntegerKernelSourcePattern =
 "__kernel void sample_test(__global %s *sourceA, __global %s *destValues)\n"
@@ -1512,19 +1507,20 @@
         switch( vecAType )
             case kULong:
-                ((cl_ulong*) destination)[0] = MAX(MIN(valueA, valueC), valueB);
+                ((cl_ulong *)destination)[0] =
+                    std::max(std::min(valueA, valueC), valueB);
             case kUInt:
-                ((cl_uint*) destination)[0] = (cl_uint)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_uint *)destination)[0] =
+                    (cl_uint)(std::max(std::min(valueA, valueC), valueB));
             case kUShort:
-                ((cl_ushort*) destination)[0] = (cl_ushort)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_ushort *)destination)[0] =
+                    (cl_ushort)(std::max(std::min(valueA, valueC), valueB));
             case kUChar:
-                ((cl_uchar*) destination)[0] = (cl_uchar)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_uchar *)destination)[0] =
+                    (cl_uchar)(std::max(std::min(valueA, valueC), valueB));
                 //error -- should never get here
@@ -1576,19 +1572,20 @@
         switch( vecAType )
             case kLong:
-                ((cl_long*) destination)[0] = MAX(MIN(valueA, valueC), valueB);
+                ((cl_long *)destination)[0] =
+                    std::max(std::min(valueA, valueC), valueB);
             case kInt:
-                ((cl_int*) destination)[0] = (cl_int)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_int *)destination)[0] =
+                    (cl_int)(std::max(std::min(valueA, valueC), valueB));
             case kShort:
-                ((cl_short*) destination)[0] = (cl_short)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_short *)destination)[0] =
+                    (cl_short)(std::max(std::min(valueA, valueC), valueB));
             case kChar:
-                ((cl_char*) destination)[0] = (cl_char)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_char *)destination)[0] =
+                    (cl_char)(std::max(std::min(valueA, valueC), valueB));
                 //error -- should never get here
@@ -1654,13 +1651,16 @@
                 ((cl_ulong*) destination)[0] = multLo;
             case kUInt:
-                ((cl_uint*) destination)[0] = (cl_uint) MIN( multLo, (cl_ulong) CL_UINT_MAX );
+                ((cl_uint *)destination)[0] =
+                    (cl_uint)std::min(multLo, (cl_ulong)CL_UINT_MAX);
             case kUShort:
-                ((cl_ushort*) destination)[0] = (cl_ushort) MIN( multLo, (cl_ulong) CL_USHRT_MAX );
+                ((cl_ushort *)destination)[0] =
+                    (cl_ushort)std::min(multLo, (cl_ulong)CL_USHRT_MAX);
             case kUChar:
-                ((cl_uchar*) destination)[0] = (cl_uchar) MIN( multLo, (cl_ulong) CL_UCHAR_MAX );
+                ((cl_uchar *)destination)[0] =
+                    (cl_uchar)std::min(multLo, (cl_ulong)CL_UCHAR_MAX);
                 //error -- should never get here
@@ -1744,18 +1744,18 @@
                 ((cl_long*) destination)[0] = result;
             case kInt:
-                result = MIN( result, (cl_long) CL_INT_MAX );
-                result = MAX( result, (cl_long) CL_INT_MIN );
+                result = std::min(result, (cl_long)CL_INT_MAX);
+                result = std::max(result, (cl_long)CL_INT_MIN);
                 ((cl_int*) destination)[0] = (cl_int) result;
             case kShort:
-                result = MIN( result, (cl_long) CL_SHRT_MAX );
-                result = MAX( result, (cl_long) CL_SHRT_MIN );
+                result = std::min(result, (cl_long)CL_SHRT_MAX);
+                result = std::max(result, (cl_long)CL_SHRT_MIN);
                 ((cl_short*) destination)[0] = (cl_short) result;
             case kChar:
-                result = MIN( result, (cl_long) CL_CHAR_MAX );
-                result = MAX( result, (cl_long) CL_CHAR_MIN );
+                result = std::min(result, (cl_long)CL_CHAR_MAX);
+                result = std::max(result, (cl_long)CL_CHAR_MIN);
                 ((cl_char*) destination)[0] = (cl_char) result;
diff --git a/test_conformance/integer_ops/test_sub_sat.cpp b/test_conformance/integer_ops/test_sub_sat.cpp
index 845d106..2a88ee0 100644
--- a/test_conformance/integer_ops/test_sub_sat.cpp
+++ b/test_conformance/integer_ops/test_sub_sat.cpp
@@ -21,28 +21,18 @@
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <algorithm>
 #include "procs.h"
-#define UCHAR_MIN   0
-#define USHRT_MIN   0
-#define UINT_MIN    0
-#ifndef MAX
-#define MAX( _a, _b )   ( (_a) > (_b) ? (_a) : (_b) )
-#ifndef MIN
-#define MIN( _a, _b )   ( (_a) < (_b) ? (_a) : (_b) )
 static int verify_subsat_char( const cl_char *inA, const cl_char *inB, const cl_char *outptr, int n, const char *sizeName, int vecSize )
     int i;
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, CL_CHAR_MIN );
-        r = MIN( r, CL_CHAR_MAX );
+        r = std::max(r, CL_CHAR_MIN);
+        r = std::min(r, CL_CHAR_MAX);
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for sub_sat( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -56,9 +46,9 @@
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_UCHAR_MAX );
-        if( r != outptr[i] )
+        r = std::max(r, 0);
+        r = std::min(r, CL_UCHAR_MAX);
+        if (r != outptr[i])
         { log_info( "\n%d) Failure for sub_sat( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
     return 0;
@@ -70,8 +60,8 @@
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, CL_SHRT_MIN );
-        r = MIN( r, CL_SHRT_MAX );
+        r = std::max(r, CL_SHRT_MIN);
+        r = std::min(r, CL_SHRT_MAX);
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for sub_sat( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -85,8 +75,8 @@
     for( i = 0; i < n; i++ )
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_USHRT_MAX );
+        r = std::max(r, 0);
+        r = std::min(r, CL_USHRT_MAX);
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for sub_sat( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
diff --git a/test_conformance/integer_ops/test_unary_ops.cpp b/test_conformance/integer_ops/test_unary_ops.cpp
index 72940ea..c91c85a 100644
--- a/test_conformance/integer_ops/test_unary_ops.cpp
+++ b/test_conformance/integer_ops/test_unary_ops.cpp
@@ -107,7 +107,7 @@
             // For sub ops, the min control value is 2. Otherwise, it's 0
             controlData[ i ] |= 0x02;
         else if( whichOp == kIncrement )
-            // For addition ops, the MAX control value is 1. Otherwise, it's 3
+            // For addition ops, the max control value is 1. Otherwise, it's 3
             controlData[ i ] &= ~0x02;
     streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index d8dfc40..3281402 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -9,7 +9,10 @@
+    common.cpp
+    common.h
+    function_list.h
@@ -20,9 +23,12 @@
+    reference_math.h
+    sleep.h
+    test_functions.h
@@ -32,6 +38,11 @@
+    utility.h
+# math_brute_force compiles cleanly with -Wall (except for a few remaining
+# warnings), but the other tests do not (yet), so enable -Wall locally.
+set_gnulike_module_compile_flags("-Wall -Wno-strict-aliasing -Wno-unknown-pragmas")
diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 4baa499..f18d0b9 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -14,16 +14,19 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
+namespace {
 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -109,49 +112,49 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     double maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -164,10 +167,10 @@
     int isNextafter;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
@@ -277,204 +280,20 @@
-static size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = 0;
-    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
     cl_int error;
     const char *name = job->f->name;
@@ -577,7 +396,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -659,7 +479,7 @@
                 float err = Bruteforce_Ulp_Error_Double(test, correct);
                 int fail = !(fabsf(err) <= ulps);
-                if (fail && ftz)
+                if (fail && (ftz || relaxedMode))
                     // retry per section
                     if (IsDoubleResultSubnormal(correct, ulps))
@@ -810,7 +630,7 @@
         if (gVerboseBruteForce)
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                  base, job->step, job->scale, buffer_elements, job->ulps,
@@ -825,3 +645,152 @@
     return error;
+} // anonymous namespace
+int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode) // Driver for f: per-thread setup, kernel build, then Test jobs
+    TestInfo test_info{}; // empty-brace init: scalar members start zeroed
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); // steps that fit in 2^32
+    }
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = 0;
+    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr); // threadCount null slots
+    }
+    test_info.tinfo.resize(test_info.threadCount); // one ThreadInfo per thread
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double), // origin: byte offset of slice i
+            test_info.subBufferSize * sizeof(cl_double) // size: slice length in bytes
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); // per-thread state built from a draw on d
+    }
+    // Init the kernels: one ThreadPool_Do job per vector size index
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels (skipped when gSkipCorrectnessTesting is set)
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors: keep the largest per-thread maxError
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+        if (error) goto exit; // checked only after the max error is gathered
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+    vlog("\n");
+    // Release the kernels held in test_info.k
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 32caafa..fe1491d 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -14,16 +14,19 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
+namespace {
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -107,49 +110,49 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) // ThreadPool_Do job: calls BuildKernel for one vector size index
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id; // an index into kernels/programs, not a width
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(), // contiguous cl_kernel array for this index
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf; // sub-buffer of gInBuffer
+    clMemWrapper inBuf2; // sub-buffer of gInBuffer2
+    Buffers outBuf; // sub-buffers of gOutBuffer[], indexed by vector size index
     float maxError; // max error value. Init to 0.
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     double maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d; // this thread's own generator state, read by Test
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -162,10 +165,10 @@
     int isNextafter;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
@@ -267,209 +270,23 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    test_info.relaxedMode = relaxedMode;
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
-    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     MTdata d = tinfo->d;
     cl_int error;
-    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    std::vector<bool> overflow(buffer_elements, false);
     const char *name = job->f->name;
     int isFDim = job->isFDim;
     int skipNanInf = job->skipNanInf;
@@ -583,7 +400,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -631,7 +449,6 @@
             vlog_error("Error: clFinish failed! err: %d\n", error);
             goto exit;
-        free(overflow);
         return CL_SUCCESS;
@@ -641,7 +458,7 @@
         // Calculate the correctly rounded reference result
         memset(&oldMode, 0, sizeof(oldMode));
-        if (ftz) ForceFTZ(&oldMode);
+        if (ftz || relaxedMode) ForceFTZ(&oldMode);
         // Set the rounding mode to match the device
         if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
@@ -726,7 +543,7 @@
                     float err = Ulp_Error(test, correct);
                     int fail = !(fabsf(err) <= ulps);
-                    if (fail && ftz)
+                    if (fail && (ftz || relaxedMode))
                         // retry per section
                         if (IsFloatResultSubnormal(correct, ulps))
@@ -938,7 +755,7 @@
                             "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
-                            "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n",
+                            "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %zu\n",
                             name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
                             s2[j], ((cl_uint *)s2)[j], r[j], test,
                             ((cl_uint *)&test)[0], j);
@@ -970,7 +787,7 @@
         if (gVerboseBruteForce)
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                  base, job->step, job->scale, buffer_elements, job->ulps,
@@ -983,6 +800,154 @@
-    if (overflow) free(overflow);
+    return error;
+} // anonymous namespace
+int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) // Driver for f: per-thread setup, kernel build, then Test jobs
+    TestInfo test_info{}; // empty-brace init: scalar members start zeroed
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); // steps that fit in 2^32
+    }
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); // also set when CL_FP_DENORM is absent
+    test_info.relaxedMode = relaxedMode;
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; // only for fdim when gInfNanSupport is false
+    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr); // threadCount null slots
+    }
+    test_info.tinfo.resize(test_info.threadCount); // one ThreadInfo per thread
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float), // origin: byte offset of slice i
+            test_info.subBufferSize * sizeof(cl_float) // size: slice length in bytes
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); // per-thread state built from a draw on d
+    }
+    // Init the kernels: one ThreadPool_Do job per vector size index
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels (skipped when gSkipCorrectnessTesting is set)
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors: keep the largest per-thread maxError
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+        if (error) goto exit; // checked only after the max error is gathered
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+    vlog("\n");
+    // Release the kernels held in test_info.k
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
     return error;
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 69e620a..f8786e6 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -21,8 +22,10 @@
 #include <climits>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -108,61 +111,63 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id; // vector-size index
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     float ulps; // max_allowed ulps
     int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
     // no special values
-} TestInfo;
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
@@ -272,210 +277,28 @@
-static size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static const int specialValuesInt[] = {
+const int specialValuesInt[] = {
     0,       1,  2,  3,  1022,  1023,  1024,   INT_MIN,
     INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX,
-static constexpr size_t specialValuesIntCount =
+constexpr size_t specialValuesIntCount =
     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
     cl_int error;
     const char *name = job->f->name;
@@ -576,7 +399,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -658,7 +482,7 @@
                 float err = Bruteforce_Ulp_Error_Double(test, correct);
                 int fail = !(fabsf(err) <= ulps);
-                if (fail && ftz)
+                if (fail && (ftz || relaxedMode))
                     // retry per section
                     if (IsDoubleResultSubnormal(correct, ulps))
@@ -744,3 +568,151 @@
     return error;
+} // anonymous namespace
+// Test driver for a function whose inputs are staged as cl_double (gInBuffer)
+// and cl_int (gInBuffer2) values.
+//
+// Sets up one ThreadInfo per worker thread (input/output sub-buffers, a
+// command queue and a random number state seeded from `d`), builds the
+// kernels for every vector-size index in
+// [gMinVectorSizeIndex, gMaxVectorSizeIndex), runs `Test` through
+// ThreadPool_Do for test_info.jobCount jobs and logs the largest error that
+// any thread recorded.
+//
+// Parameters:
+//   f           - function table entry (name, nameInCode, double_ulps, ftz).
+//   d           - random state used to derive the per-thread seeds.
+//   relaxedMode - forwarded to the kernel build and to the Test jobs.
+//
+// Returns the value of `error` left by the last OpenCL / ThreadPool_Do call.
+int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    // Each thread gets an equally sized slice of the shared buffers.
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    // Dividing the product back out detects a 32-bit overflow of `step`.
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        // Region of the cl_double buffers (gInBuffer and gOutBuffer[j]) that
+        // belongs to thread i.
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        // Region of the cl_int buffer (gInBuffer2) that belongs to thread i.
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            // Report region2: it is the region that was actually requested
+            // for gInBuffer2 (cl_int sized, not cl_double sized).
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region2.origin, region2.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        // Give every thread its own generator, seeded from the caller's.
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        // One build job per vector-size index.
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        // (done before checking `error`, so the maxima are gathered even
+        // when the run failed).
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+    vlog("\n");
+    // Release
+    // Only the raw cl_kernel handles stored in test_info.k are released by
+    // hand here.
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index e65a9aa..d855f44 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -21,8 +22,10 @@
 #include <climits>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -106,61 +109,62 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id; // vector-size index
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     float ulps; // max_allowed ulps
     int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
     // no special values
-} TestInfo;
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
@@ -262,212 +266,29 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static const int specialValuesInt[] = {
+const int specialValuesInt[] = {
     0,           1,           2,           3,          126,        127,
     128,         0x02000001,  0x04000001,  1465264071, 1488522147, -1,
     -2,          -3,          -126,        -127,       -128,       -0x02000001,
     -0x04000001, -1465264071, -1488522147,
-static size_t specialValuesIntCount =
+constexpr size_t specialValuesIntCount =
     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     float ulps = job->ulps;
     MTdata d = tinfo->d;
     cl_int error;
@@ -568,7 +389,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -650,7 +472,7 @@
                 float err = Ulp_Error(test, correct);
                 int fail = !(fabsf(err) <= ulps);
-                if (fail && ftz)
+                if (fail && (ftz || relaxedMode))
                     // retry per section
                     if (IsFloatResultSubnormal(correct, ulps))
@@ -694,7 +516,7 @@
                         "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
-                        "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n",
+                        "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %zu\n",
                         name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
                         s2[j], r[j], ((uint32_t *)r)[j], test,
                         ((cl_uint *)&test)[0], j);
@@ -723,7 +545,7 @@
         if (gVerboseBruteForce)
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                  base, job->step, job->scale, buffer_elements, job->ulps,
@@ -738,3 +560,152 @@
     return error;
+} // anonymous namespace
+int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 21e76c8..bbe5c43 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -14,15 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *operator_symbol, int vectorSize,
-                       cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                       bool relaxedMode)
+namespace {
+int BuildKernel(const char *operator_symbol, int vectorSize,
+                cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -108,49 +111,49 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *operator_symbol;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->operator_symbol, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     double maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -161,10 +164,10 @@
                       // otherwise.
     // no special fields
-} TestInfo;
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
@@ -274,201 +277,20 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
-                                           bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
     cl_int error;
     const char *name = job->f->name;
@@ -569,7 +391,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -651,7 +474,7 @@
                 float err = Bruteforce_Ulp_Error_Double(test, correct);
                 int fail = !(fabsf(err) <= ulps);
-                if (fail && ftz)
+                if (fail && (ftz || relaxedMode))
                     // retry per section
                     if (IsDoubleResultSubnormal(correct, ulps))
@@ -778,7 +601,7 @@
         if (gVerboseBruteForce)
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                  base, job->step, job->scale, buffer_elements, job->ulps,
@@ -793,3 +616,148 @@
     return error;
+} // anonymous namespace
+int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
+                                           bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index ccaef60..1a28d8d 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -14,15 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *operator_symbol, int vectorSize,
-                       cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                       bool relaxedMode)
+namespace {
+int BuildKernel(const char *operator_symbol, int vectorSize,
+                cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -106,49 +109,49 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *operator_symbol;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->operator_symbol, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     double maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -159,10 +162,10 @@
                       // otherwise.
     // no special fields
-} TestInfo;
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
@@ -264,207 +267,23 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
-                                        bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     MTdata d = tinfo->d;
     cl_int error;
-    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    std::vector<bool> overflow(buffer_elements, false);
     const char *name = job->f->name;
     cl_uint *t = 0;
     cl_float *r = 0;
@@ -584,7 +403,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -627,14 +447,13 @@
     if (gSkipCorrectnessTesting)
-        free(overflow);
         return CL_SUCCESS;
     // Calculate the correctly rounded reference result
     FPU_mode_type oldMode;
     memset(&oldMode, 0, sizeof(oldMode));
-    if (ftz) ForceFTZ(&oldMode);
+    if (ftz || relaxedMode) ForceFTZ(&oldMode);
     // Set the rounding mode to match the device
     oldRoundMode = kRoundToNearestEven;
@@ -662,7 +481,7 @@
     if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
-    if (ftz) RestoreFPState(&oldMode);
+    if (ftz || relaxedMode) RestoreFPState(&oldMode);
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
@@ -719,7 +538,7 @@
                     ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
                 if (fabsf(errB) < fabsf(err)) err = errB;
-                if (fail && ftz)
+                if (fail && (ftz || relaxedMode))
                     // retry per section
                     if (IsFloatResultSubnormal(correct, ulps))
@@ -879,7 +698,7 @@
                 if (fail)
                     vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
-                               "vs. %a (0x%8.8x) at index: %d\n",
+                               "vs. %a (0x%8.8x) at index: %zu\n",
                                name, sizeNames[k], err, s[j], s2[j], r[j], test,
                                ((cl_uint *)&test)[0], j);
                     error = -1;
@@ -907,7 +726,7 @@
         if (gVerboseBruteForce)
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                  base, job->step, job->scale, buffer_elements, job->ulps,
@@ -920,6 +739,152 @@
-    if (overflow) free(overflow);
+    return error;
+} // anonymous namespace
+int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
+                                        bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
     return error;
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index 14f4109..bbfd707 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -14,15 +14,19 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <climits>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -115,24 +119,23 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
-typedef struct ComputeReferenceInfoD_
+struct ComputeReferenceInfoD
     const double *x;
     const double *y;
@@ -141,9 +144,9 @@
     long double (*f_ffpI)(long double, long double, int *);
     cl_uint lim;
     cl_uint count;
-} ComputeReferenceInfoD;
-static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
     cl_uint lim = cri->lim;
@@ -165,10 +168,12 @@
     return CL_SUCCESS;
+} // anonymous namespace
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
@@ -187,8 +192,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -375,7 +380,7 @@
                 if (iptrUndefined) iErr = 0;
                 int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
-                if (ftz && fail)
+                if ((ftz || relaxedMode) && fail)
                     // retry per section
                     if (IsDoubleResultSubnormal(correct, f->double_ulps))
@@ -523,17 +528,20 @@
                 if (fail)
-                    vlog_error(
-                        "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
-                        "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
-                        "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
-                        "0x%16.16llx, 0x%8.8x})\n",
-                        f->name, sizeNames[k], err, iErr, ((double *)gIn)[j],
-                        ((double *)gIn2)[j], ((cl_ulong *)gIn)[j],
-                        ((cl_ulong *)gIn2)[j], ((double *)gOut_Ref)[j],
-                        ((int *)gOut_Ref2)[j], ((cl_ulong *)gOut_Ref)[j],
-                        ((cl_uint *)gOut_Ref2)[j], test, q2[j],
-                        ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
+                    vlog_error("\nERROR: %sD%s: {%f, %" PRId64
+                               "} ulp error at {%.13la, "
+                               "%.13la} ({ 0x%16.16" PRIx64 ", 0x%16.16" PRIx64
+                               "}): *{%.13la, "
+                               "%d} ({ 0x%16.16" PRIx64
+                               ", 0x%8.8x}) vs. {%.13la, %d} ({ "
+                               "0x%16.16" PRIx64 ", 0x%8.8x})\n",
+                               f->name, sizeNames[k], err, iErr,
+                               ((double *)gIn)[j], ((double *)gIn2)[j],
+                               ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j],
+                               ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
+                               ((cl_ulong *)gOut_Ref)[j],
+                               ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                               ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
                     error = -1;
                     goto exit;
@@ -544,8 +552,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -562,8 +571,8 @@
-        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
-             maxErrorVal2);
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
@@ -573,7 +582,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 5ef44b6..0747337 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -14,15 +14,19 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <climits>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -113,24 +117,23 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
-typedef struct ComputeReferenceInfoF_
+struct ComputeReferenceInfoF
     const float *x;
     const float *y;
@@ -139,9 +142,9 @@
     double (*f_ffpI)(double, double, int *);
     cl_uint lim;
     cl_uint count;
-} ComputeReferenceInfoF;
-static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
     cl_uint lim = cri->lim;
@@ -161,13 +164,15 @@
     return CL_SUCCESS;
+} // anonymous namespace
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -188,8 +193,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -375,7 +380,7 @@
                 if (iptrUndefined) iErr = 0;
                 int fail = !(fabsf(err) <= float_ulps && iErr == 0);
-                if (ftz && fail)
+                if ((ftz || relaxedMode) && fail)
                     // retry per section
                     if (IsFloatResultSubnormal(correct, float_ulps))
@@ -509,16 +514,17 @@
                 if (fail)
-                    vlog_error(
-                        "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
-                        "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
-                        "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
-                        f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
-                        ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
-                        ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
-                        ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
-                        ((cl_uint *)gOut_Ref2)[j], test, q2[j],
-                        ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
+                    vlog_error("\nERROR: %s%s: {%f, %" PRId64
+                               "} ulp error at {%a, %a} "
+                               "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
+                               "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
+                               f->name, sizeNames[k], err, iErr,
+                               ((float *)gIn)[j], ((float *)gIn2)[j],
+                               ((cl_uint *)gIn)[j], ((cl_uint *)gIn2)[j],
+                               ((float *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
+                               ((cl_uint *)gOut_Ref)[j],
+                               ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                               ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
                     error = -1;
                     goto exit;
@@ -529,8 +535,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -547,8 +554,8 @@
-        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
-             maxErrorVal2);
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
@@ -558,7 +565,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp
new file mode 100644
index 0000000..f5e9f99
--- /dev/null
+++ b/test_conformance/math_brute_force/common.cpp
@@ -0,0 +1,170 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "common.h"
+#include "utility.h" // for sizeNames and sizeValues.
+#include <sstream>
+#include <string>
+namespace {
+const char *GetTypeName(ParameterType type)
+    switch (type)
+    {
+        case ParameterType::Float: return "float";
+        case ParameterType::Double: return "double";
+    }
+    return nullptr;
+const char *GetUndefValue(ParameterType type)
+    switch (type)
+    {
+        case ParameterType::Float:
+        case ParameterType::Double: return "NAN";
+    }
+    return nullptr;
+void EmitDefineType(std::ostringstream &kernel, const char *name,
+                    ParameterType type, int vector_size_index)
+    kernel << "#define " << name << " " << GetTypeName(type)
+           << sizeNames[vector_size_index] << '\n';
+    kernel << "#define " << name << "_SCALAR " << GetTypeName(type) << '\n';
+void EmitDefineUndef(std::ostringstream &kernel, const char *name,
+                     ParameterType type)
+    kernel << "#define " << name << " " << GetUndefValue(type) << '\n';
+void EmitEnableExtension(std::ostringstream &kernel, ParameterType type)
+    switch (type)
+    {
+        case ParameterType::Double:
+            kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+            break;
+        case ParameterType::Float:
+            // No extension required.
+            break;
+    }
+} // anonymous namespace
+std::string GetKernelName(int vector_size_index)
+    return std::string("math_kernel") + sizeNames[vector_size_index];
+std::string GetTernaryKernel(const std::string &kernel_name,
+                             const char *builtin, ParameterType retType,
+                             ParameterType type1, ParameterType type2,
+                             ParameterType type3, int vector_size_index)
+    // To keep the kernel code readable, use macros for types and undef values.
+    std::ostringstream kernel;
+    EmitDefineType(kernel, "RETTYPE", retType, vector_size_index);
+    EmitDefineType(kernel, "TYPE1", type1, vector_size_index);
+    EmitDefineType(kernel, "TYPE2", type2, vector_size_index);
+    EmitDefineType(kernel, "TYPE3", type3, vector_size_index);
+    EmitDefineUndef(kernel, "UNDEF1", type1);
+    EmitDefineUndef(kernel, "UNDEF2", type2);
+    EmitDefineUndef(kernel, "UNDEF3", type3);
+    EmitEnableExtension(kernel, type1);
+    // clang-format off
+    const char *kernel_nonvec3[] = { R"(
+__kernel void )", kernel_name.c_str(), R"((__global RETTYPE* out,
+                          __global TYPE1* in1,
+                          __global TYPE2* in2,
+                          __global TYPE3* in3)
+    size_t i = get_global_id(0);
+    out[i] = )", builtin, R"((in1[i], in2[i], in3[i]);
+)" };
+    const char *kernel_vec3[] = { R"(
+__kernel void )", kernel_name.c_str(), R"((__global RETTYPE_SCALAR* out,
+                          __global TYPE1_SCALAR* in1,
+                          __global TYPE2_SCALAR* in2,
+                          __global TYPE3_SCALAR* in3)
+    size_t i = get_global_id(0);
+    if (i + 1 < get_global_size(0))
+    {
+        TYPE1 a = vload3(0, in1 + 3 * i);
+        TYPE2 b = vload3(0, in2 + 3 * i);
+        TYPE3 c = vload3(0, in3 + 3 * i);
+        RETTYPE res = )", builtin, R"((a, b, c);
+        vstore3(res, 0, out + 3 * i);
+    }
+    else
+    {
+        // Figure out how many elements are left over after
+        // BUFFER_SIZE % (3 * sizeof(type)).
+        // Assume power of two buffer size.
+        size_t parity = i & 1;
+        TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1);
+        TYPE2 b = (TYPE2)(UNDEF2, UNDEF2, UNDEF2);
+        TYPE3 c = (TYPE3)(UNDEF3, UNDEF3, UNDEF3);
+        switch (parity)
+        {
+            case 0:
+                a.y = in1[3 * i + 1];
+                b.y = in2[3 * i + 1];
+                c.y = in3[3 * i + 1];
+                // fall through
+            case 1:
+                a.x = in1[3 * i];
+                b.x = in2[3 * i];
+                c.x = in3[3 * i];
+                break;
+        }
+        RETTYPE res = )", builtin, R"((a, b, c);
+        switch (parity)
+        {
+            case 0:
+                out[3 * i + 1] = res.y;
+                // fall through
+            case 1:
+                out[3 * i] = res.x;
+                break;
+        }
+    }
+)" };
+    // clang-format on
+    if (sizeValues[vector_size_index] != 3)
+        for (const auto &chunk : kernel_nonvec3) kernel << chunk;
+    else
+        for (const auto &chunk : kernel_vec3) kernel << chunk;
+    return kernel.str();
diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h
new file mode 100644
index 0000000..143814c
--- /dev/null
+++ b/test_conformance/math_brute_force/common.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2021 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef COMMON_H
+#define COMMON_H
+#include "harness/typeWrappers.h"
+#include "utility.h"
+#include <array>
+#include <string>
+#include <vector>
+// Array of thread-specific kernels for each vector size.
+using KernelMatrix = std::array<std::vector<cl_kernel>, VECTOR_SIZE_COUNT>;
+// Array of programs for each vector size.
+using Programs = std::array<clProgramWrapper, VECTOR_SIZE_COUNT>;
+// Array of buffers for each vector size.
+using Buffers = std::array<clMemWrapper, VECTOR_SIZE_COUNT>;
+// Types supported for kernel code generation.
+enum class ParameterType
+    Float,
+    Double,
+// Return kernel name suffixed with vector size.
+std::string GetKernelName(int vector_size_index);
+// Generate kernel code for the given builtin function/operator.
+std::string GetTernaryKernel(const std::string &kernel_name,
+                             const char *builtin, ParameterType retType,
+                             ParameterType type1, ParameterType type2,
+                             ParameterType type3, int vector_size_index);
+// Information to generate OpenCL kernels.
+struct BuildKernelInfo
+    // Number of kernels to build, one for each thread to avoid data races.
+    cl_uint threadCount;
+    KernelMatrix &kernels;
+    Programs &programs;
+    // Function, macro or symbol tested by the kernel.
+    const char *nameInCode;
+    // Whether to build with -cl-fast-relaxed-math.
+    bool relaxedMode;
+#endif /* COMMON_H */
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 3edbb48..9173628 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -53,6 +53,7 @@
         STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
 #define unaryF NULL
 #define i_unaryF NULL
 #define unaryF_u NULL
diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h
index 38f739c..95a2945 100644
--- a/test_conformance/math_brute_force/function_list.h
+++ b/test_conformance/math_brute_force/function_list.h
@@ -30,7 +30,7 @@
 #include "harness/mt19937.h"
-typedef union fptr {
+union fptr {
     void *p;
     double (*f_f)(double);
     double (*f_u)(cl_uint);
@@ -45,9 +45,9 @@
     double (*f_ffpI)(double, double, int *);
     double (*f_fff)(double, double, double);
     float (*f_fma)(float, float, float, int);
-} fptr;
-typedef union dptr {
+union dptr {
     void *p;
     long double (*f_f)(long double);
     long double (*f_u)(cl_ulong);
@@ -59,20 +59,20 @@
     long double (*f_fpI)(long double, int *);
     long double (*f_ffpI)(long double, long double, int *);
     long double (*f_fff)(long double, long double, long double);
-} dptr;
 struct Func;
-typedef struct vtbl
+struct vtbl
     const char *type_name;
     int (*TestFunc)(const struct Func *, MTdata, bool);
     int (*DoubleTestFunc)(
         const struct Func *, MTdata,
         bool); // may be NULL if function is single precision only
-} vtbl;
-typedef struct Func
+struct Func
     const char *name; // common name, to be used as an argument in the shell
     const char *nameInCode; // name as it appears in the __kernel, usually the
@@ -88,7 +88,7 @@
     int ftz;
     int relaxed;
     const vtbl *vtbl_ptr;
-} Func;
 extern const Func functionList[];
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 4383fa8..0cbcf86 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -100,27 +104,28 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || gForceFTZ;
     uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE);
@@ -138,8 +143,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -244,7 +249,7 @@
                 // If we aren't getting the correctly rounded result
                 if (t[j] != q[j])
-                    if (ftz && IsDoubleSubnormal(s[j]))
+                    if ((ftz || relaxedMode) && IsDoubleSubnormal(s[j]))
                         unsigned int correct0 = f->dfunc.i_f(0.0);
                         unsigned int correct1 = f->dfunc.i_f(-0.0);
@@ -267,8 +272,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -295,7 +301,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index c803aa3..90bb1e1 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -98,27 +102,28 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
@@ -135,8 +140,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -241,7 +246,7 @@
                 // If we aren't getting the correctly rounded result
                 if (t[j] != q[j])
-                    if (ftz && IsFloatSubnormal(s[j]))
+                    if ((ftz || relaxedMode) && IsFloatSubnormal(s[j]))
                         unsigned int correct0 = f->func.i_f(0.0);
                         unsigned int correct1 = f->func.i_f(-0.0);
@@ -264,8 +269,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -291,7 +297,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index d09915f..412f210 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -107,54 +111,55 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
-typedef struct TestInfo
+    MTdataHolder d;
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
-} TestInfo;
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
@@ -264,182 +269,19 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (size_t i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     dptr dfunc = job->f->dfunc;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
     cl_int error;
     const char *name = job->f->name;
@@ -538,7 +380,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -613,7 +456,7 @@
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
             // If we aren't getting the correctly rounded result
-            if (ftz)
+            if (ftz || relaxedMode)
                 if (IsDoubleSubnormal(s[j]))
@@ -645,8 +488,9 @@
             cl_ulong err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld "
-                       "vs. %lld  (index: %d)\n",
+            vlog_error("\nERROR: %s: %" PRId64
+                       " ulp error at {%.13la, %.13la}: *%" PRId64 " "
+                       "vs. %" PRId64 "  (index: %zu)\n",
                        name, err, ((double *)s)[j], ((double *)s2)[j], t[j],
                        q[j], j);
             error = -1;
@@ -654,13 +498,14 @@
-        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = std::max(1U, gMinVectorSizeIndex);
+             k < gMaxVectorSizeIndex; k++)
             q = (cl_long *)out[k];
             // If we aren't getting the correctly rounded result
             if (-t[j] != q[j])
-                if (ftz)
+                if (ftz || relaxedMode)
                     if (IsDoubleSubnormal(s[j]))
@@ -692,8 +537,9 @@
                 cl_ulong err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, "
-                           "%.13la}: *%lld vs. %lld  (index: %d)\n",
+                vlog_error("\nERROR: %sD%s: %" PRId64 " ulp error at {%.13la, "
+                           "%.13la}: *%" PRId64 " vs. %" PRId64
+                           "  (index: %zu)\n",
                            name, sizeNames[k], err, ((double *)s)[j],
                            ((double *)s2)[j], -t[j], q[j], j);
                 error = -1;
@@ -735,3 +581,131 @@
     return error;
+} // anonymous namespace
+int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index c530cda..cb915fc 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -14,14 +14,17 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -105,54 +108,55 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
-typedef struct TestInfo
+    MTdataHolder d;
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
-} TestInfo;
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
@@ -254,183 +258,19 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
     cl_int error;
     const char *name = job->f->name;
@@ -531,7 +371,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             goto exit;
@@ -604,7 +445,7 @@
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
-            if (ftz)
+            if (ftz || relaxedMode)
                 if (IsFloatSubnormal(s[j]))
@@ -637,20 +478,21 @@
             uint32_t err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
             vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
-                       "0x%8.8x (index: %d)\n",
+                       "0x%8.8x (index: %zu)\n",
                        name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
             error = -1;
             goto exit;
-        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = std::max(1U, gMinVectorSizeIndex);
+             k < gMaxVectorSizeIndex; k++)
             q = out[k];
             // If we aren't getting the correctly rounded result
             if (-t[j] != q[j])
-                if (ftz)
+                if (ftz || relaxedMode)
                     if (IsFloatSubnormal(s[j]))
@@ -682,7 +524,7 @@
                 cl_uint err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
                 vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
-                           "vs. 0x%8.8x (index: %d)\n",
+                           "vs. 0x%8.8x (index: %zu)\n",
                            name, sizeNames[k], err, ((float *)s)[j],
                            ((float *)s2)[j], -t[j], q[j], j);
                 error = -1;
@@ -724,3 +566,132 @@
     return error;
+} // anonymous namespace
+int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 00e65a2..c2e7cdc 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -101,210 +105,61 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    Buffers outBuf;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
-} TestInfo;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     dptr dfunc = job->f->dfunc;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     cl_int error;
     const char *name = job->f->name;
@@ -362,7 +217,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
@@ -430,7 +286,7 @@
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
             // If we aren't getting the correctly rounded result
-            if (ftz)
+            if (ftz || relaxedMode)
                 if (IsDoubleSubnormal(s[j]))
@@ -442,19 +298,21 @@
             cl_ulong err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n",
+            vlog_error("\nERROR: %sD: %" PRId64
+                       " ulp error at %.13la: *%" PRId64 " vs. %" PRId64 "\n",
                        name, err, ((double *)gIn)[j], t[j], q[j]);
             return -1;
-        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = std::max(1U, gMinVectorSizeIndex);
+             k < gMaxVectorSizeIndex; k++)
             q = out[k];
             // If we aren't getting the correctly rounded result
             if (-t[j] != q[j])
-                if (ftz)
+                if (ftz || relaxedMode)
                     if (IsDoubleSubnormal(s[j]))
@@ -467,7 +325,8 @@
                 cl_ulong err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
-                    "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n",
+                    "\nERROR: %sD%s: %" PRId64 " ulp error at %.13la: *%" PRId64
+                    " vs. %" PRId64 "\n",
                     name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
                 return -1;
@@ -506,3 +365,119 @@
     return CL_SUCCESS;
+} // anonymous namespace
+int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 3c1717a..6a1b9b9 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -14,14 +14,17 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -100,211 +103,61 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    Buffers outBuf;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
-} TestInfo;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
     cl_int error = CL_SUCCESS;
     cl_int ret = CL_SUCCESS;
     const char *name = job->f->name;
@@ -365,7 +218,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
@@ -435,7 +289,7 @@
             if (gMinVectorSizeIndex == 0 && t[j] != q[j])
                 // If we aren't getting the correctly rounded result
-                if (ftz)
+                if (ftz || relaxedMode)
                     if (IsFloatSubnormal(s[j]))
@@ -454,14 +308,14 @@
-            for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
-                 k++)
+            for (auto k = std::max(1U, gMinVectorSizeIndex);
+                 k < gMaxVectorSizeIndex; k++)
                 q = out[k];
                 // If we aren't getting the correctly rounded result
                 if (-t[j] != q[j])
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         if (IsFloatSubnormal(s[j]))
@@ -521,3 +375,120 @@
     return ret;
+} // anonymous namespace
+int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index a32cd5a..8d8fec5 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -14,126 +14,49 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2,  __global double",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2, "
-        "__global double* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       double3 d2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+    auto kernel_name = GetKernelName(vectorSize);
+    auto source = GetTernaryKernel(kernel_name, name, ParameterType::Double,
+                                   ParameterType::Double, ParameterType::Double,
+                                   ParameterType::Double, vectorSize);
+    std::array<const char *, 1> sources{ source.c_str() };
+    return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p,
+                      relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     double maxErrorVal = 0.0f;
@@ -145,8 +68,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -294,7 +217,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index 095a22f..04ac5aa 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -14,127 +14,52 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2,  __global float",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global float* in2, "
-        "__global float* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       float3 f2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+    auto kernel_name = GetKernelName(vectorSize);
+    auto source = GetTernaryKernel(kernel_name, name, ParameterType::Float,
+                                   ParameterType::Float, ParameterType::Float,
+                                   ParameterType::Float, vectorSize);
+    std::array<const char *, 1> sources{ source.c_str() };
+    return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p,
+                      relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     float maxErrorVal = 0.0f;
@@ -144,8 +69,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -293,7 +218,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index d6c2f11..64491bd 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -18,6 +18,7 @@
 #include "sleep.h"
 #include "utility.h"
+#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
@@ -57,8 +58,8 @@
 cl_device_id gDevice = NULL;
 cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
-static int32_t gStartTestNumber = -1;
-static int32_t gEndTestNumber = -1;
+static size_t gStartTestNumber = ~0u;
+static size_t gEndTestNumber = ~0u;
 int gSkipCorrectnessTesting = 0;
 static int gStopOnError = 0;
 static bool gSkipRestOfTests;
@@ -97,7 +98,7 @@
 cl_mem gInBuffer3 = NULL;
-static MTdata gMTdata;
+static MTdataHolder gMTdata;
 cl_device_fp_config gFloatCapabilities = 0;
 int gWimpyReductionFactor = 32;
 int gVerboseBruteForce = 0;
@@ -128,10 +129,10 @@
         const Func *const temp_func = functionList + i;
         if (strcmp(temp_func->name, name) == 0)
-            if ((gStartTestNumber != -1 && i < gStartTestNumber)
+            if ((gStartTestNumber != ~0u && i < gStartTestNumber)
                 || i > gEndTestNumber)
-                vlog("Skipping function #%d\n", i);
+                vlog("Skipping function #%zu\n", i);
                 return 0;
@@ -167,7 +168,6 @@
-        extern int my_ilogb(double);
         if (0 == strcmp("ilogb", func_data->name))
@@ -326,7 +326,7 @@
-    gMTdata = init_genrand(gRandomSeed);
+    gMTdata = MTdataHolder(gRandomSeed);
     FPU_mode_type oldMode;
@@ -336,8 +336,6 @@
-    free_mtdata(gMTdata);
     if (gQueue)
         int error_code = clFinish(gQueue);
@@ -360,16 +358,18 @@
     int singleThreaded = 0;
     { // Extract the app name
-        strncpy(appName, argv[0], MAXPATHLEN);
+        strncpy(appName, argv[0], MAXPATHLEN - 1);
+        appName[MAXPATHLEN - 1] = '\0';
 #if defined(__APPLE__)
         char baseName[MAXPATHLEN];
         char *base = NULL;
-        strncpy(baseName, argv[0], MAXPATHLEN);
+        strncpy(baseName, argv[0], MAXPATHLEN - 1);
+        baseName[MAXPATHLEN - 1] = '\0';
         base = basename(baseName);
         if (NULL != base)
-            strncpy(appName, base, sizeof(appName));
+            strncpy(appName, base, sizeof(appName) - 1);
             appName[sizeof(appName) - 1] = '\0';
@@ -467,7 +467,7 @@
             long number = strtol(arg, &t, 0);
             if (t != arg)
-                if (-1 == gStartTestNumber)
+                if (~0u == gStartTestNumber)
                     gStartTestNumber = (int32_t)number;
                     gEndTestNumber = gStartTestNumber + (int32_t)number;
@@ -502,8 +502,6 @@
         gWimpyMode = 1;
-    vlog("\nTest binary built %s %s\n", __DATE__, __TIME__);
     if (gWimpyMode)
@@ -524,7 +522,7 @@
 static void PrintFunctions(void)
     vlog("\nMath function names:\n");
-    for (int i = 0; i < functionListCount; i++)
+    for (size_t i = 0; i < functionListCount; i++)
         vlog("\t%s\n", functionList[i].name);
@@ -1056,8 +1054,6 @@
                 cl_uint kernel_count, cl_kernel *k, cl_program *p,
                 bool relaxedMode)
-    int error = 0;
-    cl_uint i;
     char options[200] = "";
     if (gForceFTZ)
@@ -1075,7 +1071,7 @@
         strcat(options, " -cl-fast-relaxed-math");
-    error =
+    int error =
         create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
     if (error != CL_SUCCESS)
@@ -1083,9 +1079,7 @@
         return error;
-    memset(k, 0, kernel_count * sizeof(*k));
-    for (i = 0; i < kernel_count; i++)
+    for (cl_uint i = 0; i < kernel_count; i++)
         k[i] = clCreateKernel(*p, name, &error);
         if (NULL == k[i] || error)
@@ -1096,7 +1090,6 @@
             clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG,
                                   sizeof(buffer), buffer, NULL);
             vlog_error("Log: %s\n", buffer);
-            clReleaseProgram(*p);
             return error;
@@ -1244,7 +1237,7 @@
         // The unbiased exponent of the ulp unit place
         int ulp_exp =
-            DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1);
+            DBL_MANT_DIG - 1 - std::max(ilogbl(reference), DBL_MIN_EXP - 1);
         // Scale the exponent of the error
         float result = (float)scalbnl(testVal - reference, ulp_exp);
@@ -1260,7 +1253,7 @@
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
     int ulp_exp =
-        DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
+        DBL_MANT_DIG - 1 - std::max(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
     // allow correctly rounded results to pass through unmolested. (We might add
     // error to it below.) There is something of a performance optimization here
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 3a6516b..afa072f 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -41,10 +41,10 @@
 static void __log2_ep(double *hi, double *lo, double x);
-typedef union {
+union uint64d_t {
     uint64_t i;
     double d;
-} uint64d_t;
 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
@@ -1949,7 +1949,8 @@
         w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
     static const double zero = 0.00000000000000000000e+00;
-    double t, y, z, nadj, p, p1, p2, p3, q, r, w;
+    double nadj = zero;
+    double t, y, z, p, p1, p2, p3, q, r, w;
     cl_int i, hx, lx, ix;
     union {
@@ -2259,10 +2260,10 @@
     return dx / dy;
-typedef struct
+struct double_double
     double hi, lo;
-} double_double;
 // Split doubles_double into a series of consecutive 26-bit precise doubles and
 // a remainder. Note for later -- for multiplication, it might be better to
@@ -2321,7 +2322,7 @@
 static inline double_double add_dd(double_double a, double_double b)
-    double_double r = { -0.0 - 0.0 };
+    double_double r = { -0.0, -0.0 };
     if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
         || 0.0 == b.hi)
@@ -3767,10 +3768,10 @@
 static uint32_t pi_over_two[] = { 0x1,        0x2487ed51, 0x42d1846,
                                   0x26263314, 0x1701b839, 0x28948127 };
-typedef union {
+union d_ui64_t {
     uint64_t u;
     double d;
-} d_ui64_t;
 // radix or base of representation
 #define RADIX (30)
@@ -3786,13 +3787,13 @@
 // extended fixed point representation of double precision
 // floating point number.
 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
-typedef struct
+struct eprep_t
     uint32_t X[3]; // three 32 bit integers are sufficient to represent double in
                    // base_30
     int index; // exponent bias
     int sign; // sign of double
-} eprep_t;
 static eprep_t double_to_eprep(double x)
@@ -4549,8 +4550,8 @@
     if (x != x || y != y) return x + y;
     // do the work required to sort out edge cases
-    double fabsy = reference_fabs(y);
-    double fabsx = reference_fabs(x);
+    double fabsy = (double)reference_fabsl(y);
+    double fabsx = (double)reference_fabsl(x);
     double iy = reference_rint(
         fabsy); // we do round to nearest here so that |fy| <= 0.5
     if (iy > fabsy) // convert nearbyint to floor
@@ -4637,13 +4638,13 @@
     // compute product of y*log2(x)
     // scale to avoid overflow in double-double multiplication
-    if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970))
+    if (fabsy > HEX_DBL(+, 1, 0, +, 970))
         y_hi = reference_ldexp(y_hi, -53);
         y_lo = reference_ldexp(y_lo, -53);
     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
-    if (fabs(y) > HEX_DBL(+, 1, 0, +, 970))
+    if (fabsy > HEX_DBL(+, 1, 0, +, 970))
         ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
         ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
@@ -5357,10 +5358,10 @@
         0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
     }; // first 126 bits of pi
-    long double head, tail, temp;
+    long double head, tail;
 #if __LDBL_MANT_DIG__ >= 64
     // long double has 64-bits of precision or greater
-    temp = (long double)pi_bits[0] * 0x1.0p64L;
+    long double temp = (long double)pi_bits[0] * 0x1.0p64L;
     head = temp + (long double)pi_bits[1];
     temp -= head; // rounding err rounding pi_bits[1] into head
     tail = (long double)pi_bits[1] + temp;
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 606fdc5..b5f1ab0 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -14,127 +14,49 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
 #define FLUSHED 1
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2,  __global double",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2, "
-        "__global double* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       double3 d2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+    auto kernel_name = GetKernelName(vectorSize);
+    auto source = GetTernaryKernel(kernel_name, name, ParameterType::Double,
+                                   ParameterType::Double, ParameterType::Double,
+                                   ParameterType::Double, vectorSize);
+    std::array<const char *, 1> sources{ source.c_str() };
+    return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p,
+                      relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
@@ -202,14 +124,16 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
+} // anonymous namespace
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                                          bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
@@ -224,8 +148,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -387,7 +311,7 @@
                     float err = Bruteforce_Ulp_Error_Double(test, correct);
                     int fail = !(fabsf(err) <= f->double_ulps);
-                    if (fail && ftz)
+                    if (fail && (ftz || relaxedMode))
                         // retry per section
                         if (IsDoubleSubnormal(correct))
@@ -704,8 +628,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -733,7 +658,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index e52c0a0..cf36184 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -14,125 +14,49 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
 #define FLUSHED 1
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2,  __global float",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global float* in2, "
-        "__global float* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       float3 f2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+    auto kernel_name = GetKernelName(vectorSize);
+    auto source = GetTernaryKernel(kernel_name, name, ParameterType::Float,
+                                   ParameterType::Float, ParameterType::Float,
+                                   ParameterType::Float, vectorSize);
+    std::array<const char *, 1> sources{ source.c_str() };
+    return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p,
+                      relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
@@ -210,16 +134,18 @@
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
+} // anonymous namespace
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -240,8 +166,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -439,7 +365,7 @@
                     err = Ulp_Error(test, correct);
                     fail = !(fabsf(err) <= float_ulps);
-                    if (fail && ftz)
+                    if (fail && (ftz || relaxedMode))
                         float correct2, err2;
@@ -839,8 +765,8 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -868,7 +794,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index f6fa326..177cfe5 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -101,44 +105,44 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
     double maxErrorValue; // position of the max error value.  Init to 0.
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -151,185 +155,21 @@
     float half_sin_cos_tan_limit;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     cl_int error;
     int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
@@ -385,7 +225,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
@@ -463,7 +304,7 @@
                 if (fail)
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         // retry per section
                         if (IsDoubleResultSubnormal(correct, ulps))
@@ -505,7 +346,7 @@
                 if (fail)
                     vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
-                               "(0x%16.16llx): *%.13la vs. %.13la\n",
+                               "(0x%16.16" PRIx64 "): *%.13la vs. %.13la\n",
                                job->f->name, sizeNames[k], err,
                                ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
                                ((cl_double *)gOut_Ref)[j], test);
@@ -547,3 +388,133 @@
     return CL_SUCCESS;
+} // anonymous namespace
+int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 17edc58..4c1f1a1 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -14,14 +14,17 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -99,44 +102,44 @@
-typedef struct BuildKernelInfo
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
+                       info->kernels[vectorSize].data(),
+                       &(info->programs[vectorSize]), info->relaxedMode);
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    Buffers outBuf;
     float maxError; // max error value. Init to 0.
     double maxErrorValue; // position of the max error value.  Init to 0.
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-typedef struct TestInfo
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -149,207 +152,16 @@
     float half_sin_cos_tan_limit;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-    // Check for special cases for unary float
-    test_info.isRangeLimited = 0;
-    test_info.half_sin_cos_tan_limit = 0;
-    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = 1.0f
-            + test_info.ulps
-                * (FLT_EPSILON / 2.0f); // out of range results from finite
-                                        // inputs must be in [-1,1]
-    }
-    else if (0 == strcmp(f->name, "half_tan"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit =
-            INFINITY; // out of range resut from finite inputs must be numeric
-    }
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-    // Run the kernels
-    if (!gSkipCorrectnessTesting || skipTestingRelaxed)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-        if (error) goto exit;
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-        if (skipTestingRelaxed)
-        {
-            vlog(" (rlx skip correctness testing)\n");
-            goto exit;
-        }
-        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    }
-    vlog("\n");
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-        free(test_info.tinfo);
-    }
-    return error;
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     const char *fname = job->f->name;
     bool relaxedMode = job->relaxedMode;
@@ -440,7 +252,8 @@
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
@@ -619,7 +432,7 @@
                 if (fail)
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         typedef int (*CheckForSubnormal)(
                             double, float); // If we are in fast relaxed math,
@@ -725,3 +538,159 @@
     return CL_SUCCESS;
+} // anonymous namespace
+int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i].resize(test_info.threadCount, nullptr);
+    }
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+    // Check for special cases for unary float
+    test_info.isRangeLimited = 0;
+    test_info.half_sin_cos_tan_limit = 0;
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
+    }
+    else if (0 == strcmp(f->name, "half_tan"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range result from finite inputs must be numeric
+    }
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode,
+                                    relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting || skipTestingRelaxed)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+        if (skipTestingRelaxed)
+        {
+            vlog(" (rlx skip correctness testing)\n");
+            goto exit;
+        }
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    }
+    vlog("\n");
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        for (auto &kernel : test_info.k[i])
+        {
+            clReleaseKernel(kernel);
+        }
+    }
+    return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index 71dd4f4..6d7c61d 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -107,27 +111,28 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;
     float maxError1 = 0.0f;
@@ -144,8 +149,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -287,7 +292,7 @@
                     float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
                     int fail = !(fabsf(err) <= f->double_ulps
                                  && fabsf(err2) <= f->double_ulps);
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         // retry per section
                         if (IsDoubleResultSubnormal(correct, f->double_ulps))
@@ -410,8 +415,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -439,7 +445,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index 4a375ce..42e858c 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -105,27 +109,28 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;
     float maxError1 = 0.0f;
@@ -143,8 +148,8 @@
     float float_ulps = getAllowedUlpError(f, relaxedMode);
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -254,7 +259,7 @@
             // Calculate the correctly rounded reference result
             memset(&oldMode, 0, sizeof(oldMode));
-            if (ftz) ForceFTZ(&oldMode);
+            if (ftz || relaxedMode) ForceFTZ(&oldMode);
             // Set the rounding mode to match the device
             if (gIsInRTZMode)
@@ -381,7 +386,7 @@
                     int fail = !(fabsf(err) <= float_ulps
                                  && fabsf(err2) <= float_ulps);
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         // retry per section
                         if ((*isFloatResultSubnormalPtr)(correct, float_ulps))
@@ -542,8 +547,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -571,7 +577,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 14d1fb9..8b75194 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -14,15 +14,19 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <climits>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -108,33 +112,34 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
-static cl_ulong abs_cl_long(cl_long i)
+cl_ulong abs_cl_long(cl_long i)
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
+} // anonymous namespace
 int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
@@ -152,8 +157,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -290,7 +295,7 @@
                     cl_long iErr = (long long)q2[j] - (long long)correct2;
                     int fail = !(fabsf(err) <= f->double_ulps
                                  && abs_cl_long(iErr) <= maxiError);
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         // retry per section
                         if (IsDoubleResultSubnormal(correct, f->double_ulps))
@@ -382,8 +387,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -400,8 +406,8 @@
-        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
-             maxErrorVal2);
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
@@ -411,7 +417,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index 23b0d70..54843a2 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -14,15 +14,19 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <climits>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -106,33 +110,34 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
-static cl_ulong abs_cl_long(cl_long i)
+cl_ulong abs_cl_long(cl_long i)
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
+} // anonymous namespace
 int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
@@ -155,8 +160,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -293,7 +298,7 @@
                     cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
                     int fail = !(fabsf(err) <= float_ulps
                                  && abs_cl_long(iErr) <= maxiError);
-                    if (ftz)
+                    if (ftz || relaxedMode)
                         // retry per section
                         if (IsFloatResultSubnormal(correct, float_ulps))
@@ -380,8 +385,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -398,8 +404,8 @@
-        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
-             maxErrorVal2);
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
@@ -409,7 +415,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index 3c5f99d..9b60904 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -102,32 +106,33 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
-static cl_ulong random64(MTdata d)
+cl_ulong random64(MTdata d)
     return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
+} // anonymous namespace
 int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
@@ -140,8 +145,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -245,7 +250,7 @@
                     if (fail)
-                        if (ftz)
+                        if (ftz || relaxedMode)
                             // retry per section
                             if (IsDoubleResultSubnormal(correct,
@@ -263,11 +268,11 @@
                     if (fail)
-                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
-                                   "*%.13la vs. %.13la\n",
-                                   f->name, sizeNames[k], err,
-                                   ((uint64_t *)gIn)[j],
-                                   ((double *)gOut_Ref)[j], test);
+                        vlog_error(
+                            "\n%s%sD: %f ulp error at 0x%16.16" PRIx64 ": "
+                            "*%.13la vs. %.13la\n",
+                            f->name, sizeNames[k], err, ((uint64_t *)gIn)[j],
+                            ((double *)gOut_Ref)[j], test);
                         error = -1;
                         goto exit;
@@ -279,8 +284,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -307,7 +313,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 44c5af4..b67a9bd 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -14,14 +14,18 @@
 // limitations under the License.
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
+#include <cinttypes>
 #include <cstring>
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
     const char *c[] = { "__kernel void math_kernel",
@@ -99,27 +103,28 @@
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-typedef struct BuildKernelInfo
+struct BuildKernelInfo2
-    cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
-    cl_program *programs;
+    Programs &programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
+    BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
+    cl_uint vectorSize = gMinVectorSizeIndex + job_id;
+    return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
+                       &(info->programs[vectorSize]), info->relaxedMode);
+} // anonymous namespace
 int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
+    Programs programs;
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -137,8 +142,8 @@
     // Init the kernels
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
+        BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
+                                     relaxedMode };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
@@ -249,7 +254,7 @@
                     if (fail)
-                        if (ftz)
+                        if (ftz || relaxedMode)
                             // retry per section
                             if (IsFloatResultSubnormal(correct, float_ulps))
@@ -281,8 +286,9 @@
             if (gVerboseBruteForce)
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     BUFFER_SIZE);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
@@ -309,7 +315,6 @@
     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        clReleaseProgram(programs[k]);
     return error;
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index ac4db9c..b4a59ed 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -90,8 +90,7 @@
                 bool relaxedMode);
 // used to convert a bucket of bits into a search pattern through double
-static inline double DoubleFromUInt32(uint32_t bits);
-static inline double DoubleFromUInt32(uint32_t bits)
+inline double DoubleFromUInt32(uint32_t bits)
     union {
         uint64_t u;
@@ -117,25 +116,25 @@
 // premature flushing to zero.
 // However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
 // ulp_limit to be flushed to zero.
-static inline int IsFloatResultSubnormal(double x, float ulps)
+inline int IsFloatResultSubnormal(double x, float ulps)
     x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps;
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
-static inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
+inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
     x = x - abs_err;
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
-static inline int IsDoubleResultSubnormal(long double x, float ulps)
+inline int IsDoubleResultSubnormal(long double x, float ulps)
     x = fabsl(x) - MAKE_HEX_LONG(0x1.0p-1074, 0x1, -1074) * (long double)ulps;
     return x < MAKE_HEX_LONG(0x1.0p-1022, 0x1, -1022);
-static inline int IsFloatInfinity(double x)
+inline int IsFloatInfinity(double x)
     union {
         cl_float d;
@@ -145,7 +144,7 @@
     return ((u.u & 0x7fffffffU) == 0x7F800000U);
-static inline int IsFloatMaxFloat(double x)
+inline int IsFloatMaxFloat(double x)
     union {
         cl_float d;
@@ -155,7 +154,7 @@
     return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
-static inline int IsFloatNaN(double x)
+inline int IsFloatNaN(double x)
     union {
         cl_float d;
@@ -165,13 +164,13 @@
     return ((u.u & 0x7fffffffU) > 0x7F800000U);
-extern cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
 // convert long and ulong to float and double or otherwise deal with values
 // that need more precision than 53-bit. So, set the x87 to 64-bit precision.
-static inline void Force64BitFPUPrecision(void)
+inline void Force64BitFPUPrecision(void)
 #if __MINGW32__
     // The usual method is to use _controlfp as follows:
@@ -202,17 +201,17 @@
-extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
+void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
-typedef union {
+union int32f_t {
     int32_t i;
     float f;
-} int32f_t;
-typedef union {
+union int64d_t {
     int64_t l;
     double d;
-} int64d_t;
 void MulD(double *rhi, double *rlo, double u, double v);
 void AddD(double *rhi, double *rlo, double a, double b);
@@ -229,7 +228,7 @@
 float getAllowedUlpError(const Func *f, const bool relaxed);
-static inline cl_uint getTestScale(size_t typeSize)
+inline cl_uint getTestScale(size_t typeSize)
     if (gWimpyMode)
@@ -245,7 +244,7 @@
-static inline uint64_t getTestStep(size_t typeSize, size_t bufferSize)
+inline uint64_t getTestStep(size_t typeSize, size_t bufferSize)
     if (gWimpyMode)
diff --git a/test_conformance/multiple_device_context/test_multiple_devices.cpp b/test_conformance/multiple_device_context/test_multiple_devices.cpp
index 59543ad..4f187b9 100644
--- a/test_conformance/multiple_device_context/test_multiple_devices.cpp
+++ b/test_conformance/multiple_device_context/test_multiple_devices.cpp
@@ -175,9 +175,8 @@
     /* All done now! */
-  if (errors)
-    return -1;
-    return 0;
+  if (errors) return -1;
+  return 0;
 int test_two_devices(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
diff --git a/test_conformance/pipes/test_pipe_info.cpp b/test_conformance/pipes/test_pipe_info.cpp
index 7543c6c..e7b486d 100644
--- a/test_conformance/pipes/test_pipe_info.cpp
+++ b/test_conformance/pipes/test_pipe_info.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 #include "procs.h"
+#include "harness/parseParameters.h"
 const char* pipe_kernel_code = {
     "__kernel void pipe_kernel(__write_only pipe int out_pipe)\n"
@@ -39,8 +40,7 @@
     if (pipe_width != returnVal)
-        log_error("Error in clGetPipeInfo() check of pipe packet size\n");
-        return -1;
+        test_fail("Error in clGetPipeInfo() check of pipe packet size\n");
@@ -52,29 +52,37 @@
     if(pipe_depth != returnVal)
-        log_error( "Error in clGetPipeInfo() check of pipe max packets\n" );
-        return -1;
+        test_fail("Error in clGetPipeInfo() check of pipe max packets\n");
         log_info( " CL_PIPE_MAX_PACKETS passed.\n" );
-    err = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, (const char**)&pipe_kernel_code, "pipe_kernel", "-cl-std=CL2.0 -cl-kernel-arg-info");
-    test_error_ret(err, " Error creating program", -1);
+    err = create_single_kernel_helper_with_build_options(
+        context, &program, &kernel, 1, &pipe_kernel_code, "pipe_kernel",
+        "-cl-std=CL2.0 -cl-kernel-arg-info");
+    test_error_fail(err, "Error creating program");
     cl_kernel_arg_type_qualifier arg_type_qualifier = 0;
-    cl_kernel_arg_type_qualifier expected_type_qualifier = CL_KERNEL_ARG_TYPE_PIPE;
-    err = clGetKernelArgInfo( kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER, sizeof(arg_type_qualifier), &arg_type_qualifier, NULL );
-    test_error_ret(err, " clSetKernelArgInfo failed", -1);
-    err = (arg_type_qualifier != expected_type_qualifier);
-    if(err)
+    err = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                             sizeof(arg_type_qualifier), &arg_type_qualifier,
+                             NULL);
+    if (gCompilationMode == kOnline)
-        print_error(err, "ERROR: Bad type qualifier\n");
-        return -1;
+        test_error_fail(err, "clGetKernelArgInfo failed");
+        if (arg_type_qualifier != CL_KERNEL_ARG_TYPE_PIPE)
+        {
+            test_fail("ERROR: Incorrect type qualifier: %i\n",
+                      arg_type_qualifier);
+        }
+    }
+    else
+    {
+        test_failure_error_ret(err, CL_KERNEL_ARG_INFO_NOT_AVAILABLE,
+                               "clGetKernelArgInfo error not as expected",
+                               TEST_FAIL);
-    return err;
+    return TEST_PASS;
diff --git a/test_conformance/pipes/test_pipe_limits.cpp b/test_conformance/pipes/test_pipe_limits.cpp
index 169ab80..e1048f5 100644
--- a/test_conformance/pipes/test_pipe_limits.cpp
+++ b/test_conformance/pipes/test_pipe_limits.cpp
@@ -69,7 +69,7 @@
-          // clang-format om
+        // clang-format on
     stream << R"(
@@ -163,7 +163,7 @@
     cl_int err;
     cl_int size;
     int num_pipe_elements = 1024;
-    int i, j;
+    int i;
     int max_pipe_args;
     std::stringstream source;
     clEventWrapper producer_sync_event = NULL;
@@ -648,4 +648,4 @@
     return 0;
\ No newline at end of file
diff --git a/test_conformance/pipes/test_pipe_read_write.cpp b/test_conformance/pipes/test_pipe_read_write.cpp
index dd0d121..425c7ae 100644
--- a/test_conformance/pipes/test_pipe_read_write.cpp
+++ b/test_conformance/pipes/test_pipe_read_write.cpp
@@ -414,9 +414,9 @@
 static int verify_readwrite_double(void *ptr1, void *ptr2, int n)
     int                i;
-    long long int    sum_input = 0, sum_output = 0;
-    long long int    *inptr = (long long int *)ptr1;
-    long long int    *outptr = (long long int *)ptr2;
+    cl_long sum_input = 0, sum_output = 0;
+    cl_long *inptr = (cl_long *)ptr1;
+    cl_long *outptr = (cl_long *)ptr2;
     for(i = 0; i < n; i++)
@@ -626,7 +626,6 @@
     size_t size = sizeof(TestStruct);
     size_t global_work_size[3];
     cl_int err;
-    int total_errors = 0;
     int i;
     MTdataHolder d(gRandomSeed);
     clEventWrapper producer_sync_event = NULL;
@@ -1076,7 +1075,8 @@
     if(!is_extension_available(deviceID, "cl_khr_fp16"))
-        log_info("cl_khr_fp16 is not supported on this platoform. Skipping test.\n");
+        log_info(
+            "cl_khr_fp16 is not supported on this platform. Skipping test.\n");
         return CL_SUCCESS;
     ptrSizes[0] = sizeof(cl_float) / 2;
@@ -1246,7 +1246,7 @@
     size_t  min_alignment = get_min_alignment(context);
-    foo = verify_readwrite_long;
+    foo = verify_readwrite_double;
     ptrSizes[0] = sizeof(cl_double);
     ptrSizes[1] = ptrSizes[0] << 1;
@@ -1257,7 +1257,8 @@
     //skip devices that don't support double
     if(!is_extension_available(deviceID, "cl_khr_fp64"))
-        log_info("cl_khr_fp64 is not supported on this platoform. Skipping test.\n");
+        log_info(
+            "cl_khr_fp64 is not supported on this platform. Skipping test.\n");
         return CL_SUCCESS;
@@ -1404,7 +1405,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_int(deviceID, context, queue, num_elements);
@@ -1418,7 +1420,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_uint(deviceID, context, queue, num_elements);
@@ -1432,7 +1435,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_short(deviceID, context, queue, num_elements);
@@ -1446,7 +1450,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_ushort(deviceID, context, queue, num_elements);
@@ -1460,7 +1465,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_char(deviceID, context, queue, num_elements);
@@ -1474,7 +1480,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_uchar(deviceID, context, queue, num_elements);
@@ -1489,7 +1496,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_float(deviceID, context, queue, num_elements);
@@ -1503,7 +1511,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_half(deviceID, context, queue, num_elements);
@@ -1517,7 +1526,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_long(deviceID, context, queue, num_elements);
@@ -1531,7 +1541,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_ulong(deviceID, context, queue, num_elements);
@@ -1545,7 +1556,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     return test_pipe_readwrite_double(deviceID, context, queue, num_elements);
@@ -1555,7 +1567,8 @@
     if(!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
     const char *kernelNames[] = {"test_pipe_subgroup_write_struct","test_pipe_subgroup_read_struct"};
diff --git a/test_conformance/pipes/test_pipe_subgroups.cpp b/test_conformance/pipes/test_pipe_subgroups.cpp
index b3e1718..8e2f6e5 100644
--- a/test_conformance/pipes/test_pipe_subgroups.cpp
+++ b/test_conformance/pipes/test_pipe_subgroups.cpp
@@ -114,9 +114,8 @@
     if (!is_extension_available(deviceID, "cl_khr_subgroups"))
-        log_info(
-            "cl_khr_subgroups is not supported on this platoform. Skipping "
-            "test.\n");
+        log_info("cl_khr_subgroups is not supported on this platform. Skipping "
+                 "test.\n");
         return CL_SUCCESS;
diff --git a/test_conformance/printf/test_printf.cpp b/test_conformance/printf/test_printf.cpp
index 2b804e4..d638cd4 100644
--- a/test_conformance/printf/test_printf.cpp
+++ b/test_conformance/printf/test_printf.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -232,10 +232,8 @@
 static cl_program makePrintfProgram(cl_kernel *kernel_ptr, const cl_context context,const unsigned int testId,const unsigned int testNum,bool isLongSupport,bool is64bAddrSpace)
-    int err,i;
+    int err;
     cl_program program;
-    cl_device_id devID;
-    char buildLog[ 1024 * 128 ];
     char testname[256] = {0};
     char addrSpaceArgument[256] = {0};
     char addrSpacePAddArgument[256] = {0};
@@ -825,73 +823,75 @@
     return doTest(gQueue, gContext, TYPE_ADDRESS_SPACE, 4, deviceID);
+int test_buffer_size(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
+    size_t printf_buff_size = 0;
+    const size_t printf_buff_size_req = !gIsEmbedded ? (1024 * 1024UL) : 1024UL;
+    const size_t config_size = sizeof(printf_buff_size);
+    cl_int err = CL_SUCCESS;
+    err = clGetDeviceInfo(deviceID, CL_DEVICE_PRINTF_BUFFER_SIZE, config_size,
+                          &printf_buff_size, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("Unable to query CL_DEVICE_PRINTF_BUFFER_SIZE");
+        return TEST_FAIL;
+    }
+    if (printf_buff_size < printf_buff_size_req)
+    {
+        log_error("CL_DEVICE_PRINTF_BUFFER_SIZE does not meet requirements");
+        return TEST_FAIL;
+    }
+    return TEST_PASS;
 test_definition test_list[] = {
-    ADD_TEST( int_0 ),
-    ADD_TEST( int_1 ),
-    ADD_TEST( int_2 ),
-    ADD_TEST( int_3 ),
-    ADD_TEST( int_4 ),
-    ADD_TEST( int_5 ),
-    ADD_TEST( int_6 ),
-    ADD_TEST( int_7 ),
-    ADD_TEST( int_8 ),
+    ADD_TEST(int_0),           ADD_TEST(int_1),
+    ADD_TEST(int_2),           ADD_TEST(int_3),
+    ADD_TEST(int_4),           ADD_TEST(int_5),
+    ADD_TEST(int_6),           ADD_TEST(int_7),
+    ADD_TEST(int_8),
-    ADD_TEST( float_0 ),
-    ADD_TEST( float_1 ),
-    ADD_TEST( float_2 ),
-    ADD_TEST( float_3 ),
-    ADD_TEST( float_4 ),
-    ADD_TEST( float_5 ),
-    ADD_TEST( float_6 ),
-    ADD_TEST( float_7 ),
-    ADD_TEST( float_8 ),
-    ADD_TEST( float_9 ),
-    ADD_TEST( float_10 ),
-    ADD_TEST( float_11 ),
-    ADD_TEST( float_12 ),
-    ADD_TEST( float_13 ),
-    ADD_TEST( float_14 ),
-    ADD_TEST( float_15 ),
-    ADD_TEST( float_16 ),
-    ADD_TEST( float_17 ),
+    ADD_TEST(float_0),         ADD_TEST(float_1),
+    ADD_TEST(float_2),         ADD_TEST(float_3),
+    ADD_TEST(float_4),         ADD_TEST(float_5),
+    ADD_TEST(float_6),         ADD_TEST(float_7),
+    ADD_TEST(float_8),         ADD_TEST(float_9),
+    ADD_TEST(float_10),        ADD_TEST(float_11),
+    ADD_TEST(float_12),        ADD_TEST(float_13),
+    ADD_TEST(float_14),        ADD_TEST(float_15),
+    ADD_TEST(float_16),        ADD_TEST(float_17),
-    ADD_TEST( float_limits_0 ),
-    ADD_TEST( float_limits_1 ),
-    ADD_TEST( float_limits_2 ),
+    ADD_TEST(float_limits_0),  ADD_TEST(float_limits_1),
+    ADD_TEST(float_limits_2),
-    ADD_TEST( octal_0 ),
-    ADD_TEST( octal_1 ),
-    ADD_TEST( octal_2 ),
-    ADD_TEST( octal_3 ),
+    ADD_TEST(octal_0),         ADD_TEST(octal_1),
+    ADD_TEST(octal_2),         ADD_TEST(octal_3),
-    ADD_TEST( unsigned_0 ),
-    ADD_TEST( unsigned_1 ),
+    ADD_TEST(unsigned_0),      ADD_TEST(unsigned_1),
-    ADD_TEST( hexadecimal_0 ),
-    ADD_TEST( hexadecimal_1 ),
-    ADD_TEST( hexadecimal_2 ),
-    ADD_TEST( hexadecimal_3 ),
-    ADD_TEST( hexadecimal_4 ),
+    ADD_TEST(hexadecimal_0),   ADD_TEST(hexadecimal_1),
+    ADD_TEST(hexadecimal_2),   ADD_TEST(hexadecimal_3),
+    ADD_TEST(hexadecimal_4),
-    ADD_TEST( char_0 ),
-    ADD_TEST( char_1 ),
-    ADD_TEST( char_2 ),
+    ADD_TEST(char_0),          ADD_TEST(char_1),
+    ADD_TEST(char_2),
-    ADD_TEST( string_0 ),
-    ADD_TEST( string_1 ),
-    ADD_TEST( string_2 ),
+    ADD_TEST(string_0),        ADD_TEST(string_1),
+    ADD_TEST(string_2),
-    ADD_TEST( vector_0 ),
-    ADD_TEST( vector_1 ),
-    ADD_TEST( vector_2 ),
-    ADD_TEST( vector_3 ),
-    ADD_TEST( vector_4 ),
+    ADD_TEST(vector_0),        ADD_TEST(vector_1),
+    ADD_TEST(vector_2),        ADD_TEST(vector_3),
+    ADD_TEST(vector_4),
-    ADD_TEST( address_space_0 ),
-    ADD_TEST( address_space_1 ),
-    ADD_TEST( address_space_2 ),
-    ADD_TEST( address_space_3 ),
-    ADD_TEST( address_space_4 ),
+    ADD_TEST(address_space_0), ADD_TEST(address_space_1),
+    ADD_TEST(address_space_2), ADD_TEST(address_space_3),
+    ADD_TEST(address_space_4),
+    ADD_TEST(buffer_size),
 const int test_num = ARRAY_SIZE( test_list );
@@ -1030,8 +1030,6 @@
         return TEST_SKIP;
-    log_info( "Test binary built %s %s\n", __DATE__, __TIME__ );
     gFd = acquireOutputStream(&err);
     if (err != 0)
diff --git a/test_conformance/printf/util_printf.cpp b/test_conformance/printf/util_printf.cpp
index 3546c5f..d45e1d4 100644
--- a/test_conformance/printf/util_printf.cpp
+++ b/test_conformance/printf/util_printf.cpp
@@ -842,8 +842,6 @@
 void generateRef(const cl_device_id device)
-    int fd = -1;
-    char _refBuffer[ANALYSIS_BUFFER_SIZE];
     const cl_device_fp_config fpConfig = get_default_rounding_mode(device);
     const RoundingMode hostRound = get_round();
     RoundingMode deviceRound;
diff --git a/test_conformance/profiling/execute.cpp b/test_conformance/profiling/execute.cpp
index edfc043..44b1bcd 100644
--- a/test_conformance/profiling/execute.cpp
+++ b/test_conformance/profiling/execute.cpp
@@ -21,6 +21,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <algorithm>
 #include "procs.h"
 #include "harness/testHarness.h"
 #include "harness/errorHelpers.h"
@@ -29,12 +31,6 @@
 typedef unsigned char uchar;
-#undef MIN
-#define MIN(x,y)    ( (x) < (y) ? (x) : (y) )
-#undef MAX
-#define MAX(x,y)    ( (x) > (y) ? (x) : (y) )
 //#define CREATE_OUTPUT    1
 extern int writePPM( const char *filename, uchar *buf, int xsize, int ysize );
@@ -73,8 +69,8 @@
 static void read_imagef( int x, int y, int w, int h, int nChannels, uchar *src, float *srcRgb )
     // clamp the coords
-    int    x0 = MIN( MAX( x, 0 ), w - 1 );
-    int    y0 = MIN( MAX( y, 0 ), h - 1 );
+    int x0 = std::min(std::max(x, 0), w - 1);
+    int y0 = std::min(std::max(y, 0), h - 1);
     // get tine index
     int    indx = ( y0 * w + x0 ) * nChannels;
@@ -339,8 +335,8 @@
     clReleaseMemObject( memobjs[1] );
     clReleaseMemObject( memobjs[0] );
-  if (check_times(queueStart, submitStart, writeStart, writeEnd, device))
-    err = -1;
+    if (check_times(queueStart, submitStart, writeStart, writeEnd, device))
+        err = -1;
     return err;
diff --git a/test_conformance/profiling/writeImage.cpp b/test_conformance/profiling/writeImage.cpp
index fbc8fbc..ec2fbda 100644
--- a/test_conformance/profiling/writeImage.cpp
+++ b/test_conformance/profiling/writeImage.cpp
@@ -628,8 +628,8 @@
     free( dst );
     free( inptr );
-  if (check_times(queueStart, submitStart, writeStart, writeEnd, device))
-    err = -1;
+    if (check_times(queueStart, submitStart, writeStart, writeEnd, device))
+        err = -1;
     return err;
diff --git a/test_conformance/ b/test_conformance/
index ea7f677..974491e 100755
--- a/test_conformance/
+++ b/test_conformance/
@@ -8,295 +8,304 @@
-import os, re, sys, subprocess, time, commands, tempfile, math, string
+from __future__ import print_function
+import os
+import re
+import sys
+import subprocess
+import time
+import tempfile
 DEBUG = 0
-log_file_name = "opencl_conformance_results_" + time.strftime("%Y-%m-%d_%H-%M", time.localtime())+ ".log"
+log_file_name = "opencl_conformance_results_" + time.strftime("%Y-%m-%d_%H-%M", time.localtime()) + ".log"
 process_pid = 0
 # The amount of time between printing a "." (if no output from test) or ":" (if output)
 #  to the screen while the tests are running.
-seconds_between_status_updates = 60*60*24*7  # effectively never
+seconds_between_status_updates = 60 * 60 * 24 * 7  # effectively never
 # Help info
-def write_help_info() :
- print(" test_list [CL_DEVICE_TYPE(s) to test] [partial-test-names, ...] [log=path/to/log/file/]")
- print(" test_list - the .csv file containing the test names and commands to run the tests.")
- print(" [partial-test-names, ...] - optional partial strings to select a subset of the tests to run.")
- print(" [CL_DEVICE_TYPE(s) to test] - list of CL device types to test, default is CL_DEVICE_TYPE_DEFAULT.")
- print(" [log=path/to/log/file/] - provide a path for the test log file, default is in the current directory.")
- print("   (Note: spaces are not allowed in the log file path.")
+def write_help_info():
+    print(" test_list [CL_DEVICE_TYPE(s) to test] [partial-test-names, ...] [log=path/to/log/file/]")
+    print(" test_list - the .csv file containing the test names and commands to run the tests.")
+    print(" [partial-test-names, ...] - optional partial strings to select a subset of the tests to run.")
+    print(" [CL_DEVICE_TYPE(s) to test] - list of CL device types to test, default is CL_DEVICE_TYPE_DEFAULT.")
+    print(" [log=path/to/log/file/] - provide a path for the test log file, default is in the current directory.")
+    print("   (Note: spaces are not allowed in the log file path.")
 # Get the time formatted nicely
-def get_time() :
- return time.strftime("%d-%b %H:%M:%S", time.localtime())
+def get_time():
+    return time.strftime("%d-%b %H:%M:%S", time.localtime())
 # Write text to the screen and the log file
-def write_screen_log(text) :
- global log_file
- print(text)
- log_file.write(text+"\n")
+def write_screen_log(text):
+    global log_file
+    print(text)
+    log_file.write(text + "\n")
 # Load the tests from a csv formated file of the form name,command
 def get_tests(filename, devices_to_test):
- tests = []
- if (os.path.exists(filename) == False):
-  print("FAILED: test_list \"" + filename + "\" does not exist.")
-  print("")
-  write_help_info()
-  sys.exit(-1)
- file = open(filename, 'r')
- for line in file.readlines():
-  comment ="^#.*", line)
-  if (comment):
-   continue
-  device_specific_match ="^\s*(.+?)\s*,\s*(.+?)\s*,\s*(.+?)\s*$", line)
-  if (device_specific_match):
-   if ( in devices_to_test):
-    test_path = string.replace(, '/', os.sep)
-    test_name = string.replace(, '/', os.sep)
-    tests.append((test_name, test_path))
-   else:
-    print("Skipping " + + " because " + + " is not in the list of devices to test.")
-   continue
-  match ="^\s*(.+?)\s*,\s*(.+?)\s*$", line)
-  if (match):
-   test_path = string.replace(, '/', os.sep)
-   test_name = string.replace(, '/', os.sep)
-   tests.append((test_name, test_path))
- return tests
+    tests = []
+    if os.path.exists(filename) == False:
+        print("FAILED: test_list \"" + filename + "\" does not exist.")
+        print("")
+        write_help_info()
+        sys.exit(-1)
+    file = open(filename, 'r')
+    for line in file.readlines():
+        comment ="^#.*", line)
+        if comment:
+            continue
+        device_specific_match ="^\s*(.+?)\s*,\s*(.+?)\s*,\s*(.+?)\s*$", line)
+        if device_specific_match:
+            if in devices_to_test:
+                test_path = str.replace(, '/', os.sep)
+                test_name = str.replace(, '/', os.sep)
+                tests.append((test_name, test_path))
+            else:
+                print("Skipping " + + " because " + + " is not in the list of devices to test.")
+            continue
+        match ="^\s*(.+?)\s*,\s*(.+?)\s*$", line)
+        if match:
+            test_path = str.replace(, '/', os.sep)
+            test_name = str.replace(, '/', os.sep)
+            tests.append((test_name, test_path))
+    return tests
 def run_test_checking_output(current_directory, test_dir, log_file):
-  global process_pid, seconds_between_status_updates
-  failures_this_run = 0
-  start_time = time.time()
-  # Create a temporary file for capturing the output from the test
-  (output_fd, output_name) = tempfile.mkstemp()
-  if ( not os.path.exists(output_name)) :
-    write_screen_log("\n           ==> ERROR: could not create temporary file %s ." % output_name)
-    os.close(output_fd)
-    return -1
-  # Execute the test
-  program_to_run = test_dir_without_args = test_dir.split(None, 1)[0]
-  if ( os.sep == '\\' ) : program_to_run += ".exe"
-  if (os.path.exists(current_directory + os.sep + program_to_run)) :
-    os.chdir(os.path.dirname(current_directory+os.sep+test_dir_without_args) )
-    try:
-      if (DEBUG): p = subprocess.Popen("", stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
-      else :  p = subprocess.Popen(current_directory + os.sep + test_dir, stderr=output_fd, stdout=output_fd, shell=True)
-    except OSError:
-      write_screen_log("\n           ==> ERROR: failed to execute test. Failing test. : " + str(OSError))
-      os.close(output_fd)
-      return -1
-  else:
-    write_screen_log("\n           ==> ERROR: test file (" + current_directory + os.sep + program_to_run +") does not exist.  Failing test.")
-    os.close(output_fd)
-    return -1
-  # Set the global pid so we can kill it if this is aborted
-  process_pid =
-  # Read one character at a time from the temporary output file while the process is running.
-  # When we get an end-of-line, look for errors and write the results to the log file.
-  # This allows us to process the file as it is being produced.
-  # Keep track of the state for reading
-  # Whether we are done, if we have more to read, and where in the file we last read
-  done = False
-  more_to_read = True
-  pointer = 0
-  pointer_at_last_user_update = 0
-  output_this_run = False
-  try:
-    read_output = open(output_name, 'r')
-  except IOError:
-    write_screen_log("\n           ==> ERROR: could not open output file from test.")
-    os.close(output_fd)
-    return -1
-  line = ""
-  while (not done or more_to_read):
-    os.fsync(output_fd)
-    # Determine if we should display some output
-    elapsed_time = (time.time() - start_time)
-    if (elapsed_time > seconds_between_status_updates):
-      start_time = time.time()
-      # If we've received output from the test since the last update, display a #
-      if (pointer != pointer_at_last_user_update):
-        sys.stdout.write(":")
-      else:
-        sys.stdout.write(".")
-      pointer_at_last_user_update = pointer
-      sys.stdout.flush()
-    # Check if we're done
-    p.poll()
-    if (not done and p.returncode != None):
-      if (p.returncode < 0):
-        if (not output_this_run):
-         print ""
-         output_this_run = True
-        write_screen_log("           ==> ERROR: test killed/crashed: " + str(p.returncode)+ ".")
-      done = True
-    # Try reading
-    try:
-      char_read =
-    except IOError:
-      time.sleep(1)
-      continue
-    # If we got a full line then process it
-    if (char_read == "\n"):
-      # Look for failures and report them as such
-      match =".*(FAILED|ERROR).*", line)
-      if (match):
-        if (not output_this_run):
-         print ""
-         output_this_run = True
-        print("           ==> " + line.replace('\n',''))
-      match =".*FAILED.*", line)
-      if (match):
-        failures_this_run = failures_this_run + 1
-      match =".*(PASSED).*", line)
-      if (match):
-       if (not output_this_run):
-        print ""
-        output_this_run = True
-       print("               " + line.replace('\n',''))
-      # Write it to the log
-      log_file.write("     " + line +"\n")
-      log_file.flush()
-      line = ""
-      pointer = pointer + 1
-    # If we are at the end of the file, then re-open it to get new data
-    elif (char_read == ""):
-      more_to_read = False
-      read_output.close()
-      time.sleep(1)
-      try:
-        os.fsync(output_fd)
-        read_output = open(output_name, 'r')
-        # See if there is more to read. This happens if the process ends and we have data left.
-        if ( != ""):
-          more_to_read = True
-      except IOError:
-        write_screen_log("\n           ==> ERROR: could not reopen output file from test.")
+    global process_pid, seconds_between_status_updates
+    failures_this_run = 0
+    start_time = time.time()
+    # Create a temporary file for capturing the output from the test
+    (output_fd, output_name) = tempfile.mkstemp()
+    if not os.path.exists(output_name):
+        write_screen_log("\n           ==> ERROR: could not create temporary file %s ." % output_name)
+        os.close(output_fd)
         return -1
-        done = True
+    # Execute the test
+    program_to_run = test_dir_without_args = test_dir.split(None, 1)[0]
+    if os.sep == '\\':
+        program_to_run += ".exe"
+    if os.path.exists(current_directory + os.sep + program_to_run):
+        os.chdir(os.path.dirname(current_directory + os.sep + test_dir_without_args))
+        try:
+            if DEBUG: p = subprocess.Popen("", stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
+            else: p = subprocess.Popen(current_directory + os.sep + test_dir, stderr=output_fd, stdout=output_fd, shell=True)
+        except OSError:
+            write_screen_log("\n           ==> ERROR: failed to execute test. Failing test. : " + str(OSError))
+            os.close(output_fd)
+            return -1
-      line = line + char_read
-      pointer = pointer + 1
-  # Now we are done, so write out any remaining data in the file:
-  # This should only happen if the process exited with an error.
-  os.fsync(output_fd)
-  while ( != ""):
-    log_file.write(
-  # Return the total number of failures
-  if (p.returncode == 0 and failures_this_run > 0):
-   write_screen_log("\n           ==> ERROR: Test returned 0, but number of FAILED lines reported is " + str(failures_this_run) +".")
-   return failures_this_run
-  return p.returncode
+        write_screen_log("\n           ==> ERROR: test file (" + current_directory + os.sep + program_to_run + ") does not exist.  Failing test.")
+        os.close(output_fd)
+        return -1
+    # Set the global pid so we can kill it if this is aborted
+    process_pid =
+    # Read one character at a time from the temporary output file while the process is running.
+    # When we get an end-of-line, look for errors and write the results to the log file.
+    # This allows us to process the file as it is being produced.
+    # Keep track of the state for reading
+    # Whether we are done, if we have more to read, and where in the file we last read
+    done = False
+    more_to_read = True
+    pointer = 0
+    pointer_at_last_user_update = 0
+    output_this_run = False
+    try:
+        read_output = open(output_name, 'r')
+    except IOError:
+        write_screen_log("\n           ==> ERROR: could not open output file from test.")
+        os.close(output_fd)
+        return -1
+    line = ""
+    while not done or more_to_read:
+        os.fsync(output_fd)
+        # Determine if we should display some output
+        elapsed_time = (time.time() - start_time)
+        if elapsed_time > seconds_between_status_updates:
+            start_time = time.time()
+            # If we've received output from the test since the last update, display a #
+            if pointer != pointer_at_last_user_update:
+                sys.stdout.write(":")
+            else:
+                sys.stdout.write(".")
+            pointer_at_last_user_update = pointer
+            sys.stdout.flush()
+        # Check if we're done
+        p.poll()
+        if not done and p.returncode != None:
+            if p.returncode < 0:
+                if not output_this_run:
+                    print("")
+                    output_this_run = True
+                write_screen_log("           ==> ERROR: test killed/crashed: " + str(p.returncode) + ".")
+            done = True
+        # Try reading
+        try:
+            char_read =
+        except IOError:
+            time.sleep(1)
+            continue
+        # If we got a full line then process it
+        if char_read == "\n":
+            # Look for failures and report them as such
+            match =".*(FAILED|ERROR).*", line)
+            if match:
+                if not output_this_run:
+                    print("")
+                    output_this_run = True
+                print("           ==> " + line.replace('\n', ''))
+            match =".*FAILED.*", line)
+            if match:
+                failures_this_run = failures_this_run + 1
+            match =".*(PASSED).*", line)
+            if match:
+                if not output_this_run:
+                    print("")
+                    output_this_run = True
+                print("               " + line.replace('\n', ''))
+            # Write it to the log
+            log_file.write("     " + line + "\n")
+            log_file.flush()
+            line = ""
+            pointer = pointer + 1
+        # If we are at the end of the file, then re-open it to get new data
+        elif char_read == "":
+            more_to_read = False
+            read_output.close()
+            time.sleep(1)
+            try:
+                os.fsync(output_fd)
+                read_output = open(output_name, 'r')
+                # See if there is more to read. This happens if the process ends and we have data left.
+                if != "":
+                    more_to_read = True
+            except IOError:
+                write_screen_log("\n           ==> ERROR: could not reopen output file from test.")
+                return -1
+        else:
+            line = line + char_read
+            pointer = pointer + 1
+    # Now we are done, so write out any remaining data in the file:
+    # This should only happen if the process exited with an error.
+    os.fsync(output_fd)
+    while != "":
+        log_file.write(
+    # Return the total number of failures
+    if (p.returncode == 0 and failures_this_run > 0):
+        write_screen_log("\n           ==> ERROR: Test returned 0, but number of FAILED lines reported is " + str(failures_this_run) + ".")
+        return failures_this_run
+    return p.returncode
-def run_tests(tests) :
-  global curent_directory
-  global process_pid
-  # Run the tests
-  failures = 0
-  previous_test = None
-  test_number = 1
-  for test in tests:
-   # Print the name of the test we're running and the time
-   (test_name, test_dir) = test
-   if (test_dir != previous_test):
-    print("==========   " + test_dir)
-    log_file.write("========================================================================================\n")
-    log_file.write("========================================================================================\n")
-    log_file.write("(" + get_time() + ")     Running Tests: " + test_dir +"\n")
-    log_file.write("========================================================================================\n")
-    log_file.write("========================================================================================\n")
-    previous_test = test_dir
-   print("("+get_time()+")     BEGIN  " + test_name.ljust(40) +": "),
-   log_file.write("     ----------------------------------------------------------------------------------------\n")
-   log_file.write("     (" + get_time() + ")     Running Sub Test: " + test_name + "\n")
-   log_file.write("     ----------------------------------------------------------------------------------------\n")
-   log_file.flush()
-   sys.stdout.flush()
+def run_tests(tests):
+    """Run every test in ``tests`` in order and count the failures.
+
+    tests: sequence of (test_name, test_dir) pairs; test_dir is what gets
+        handed to run_test_checking_output().
+    Returns the number of tests whose result was non-zero.
+
+    Progress is printed to stdout and details are written to the module-level
+    log_file.  On Ctrl-C the running test is killed and the user chooses
+    between aborting the whole run (sys.exit(-1)) and continuing with the
+    interrupted test reported as failed.
+    """
+    global current_directory
+    global process_pid
+    # Run the tests
+    failures = 0
+    previous_test = None
+    test_number = 1
+    for test in tests:
+        # Print the name of the test we're running and the time
+        (test_name, test_dir) = test
+        # Only emit the banner when the test directory/command changes
+        if test_dir != previous_test:
+            print("==========   " + test_dir)
+            log_file.write("========================================================================================\n")
+            log_file.write("========================================================================================\n")
+            log_file.write("(" + get_time() + ")     Running Tests: " + test_dir + "\n")
+            log_file.write("========================================================================================\n")
+            log_file.write("========================================================================================\n")
+            previous_test = test_dir
+        print("(" + get_time() + ")     BEGIN  " + test_name.ljust(40) + ": ", end='')
+        log_file.write("     ----------------------------------------------------------------------------------------\n")
+        log_file.write("     (" + get_time() + ")     Running Sub Test: " + test_name + "\n")
+        log_file.write("     ----------------------------------------------------------------------------------------\n")
+        log_file.flush()
+        sys.stdout.flush()
-   # Run the test
-   result = 0
-   start_time = time.time()
-   try:
-    process_pid = 0
-    result = run_test_checking_output(current_directory, test_dir, log_file)
-   except KeyboardInterrupt:
-    # Catch an interrupt from the user
-    write_screen_log("\nFAILED: Execution interrupted.  Killing test process, but not aborting full test run.")
-    os.kill(process_pid, 9)
-    answer = raw_input("Abort all tests? (y/n)")
-    if (answer.find("y") != -1):
-     write_screen_log("\nUser chose to abort all tests.")
-     log_file.close()
-     sys.exit(-1)
-    else:
-     write_screen_log("\nUser chose to continue with other tests. Reporting this test as failed.")
-     result = 1
-   run_time = (time.time() - start_time)
+        # Run the test
+        result = 0
+        start_time = time.time()
+        try:
+            process_pid = 0
+            result = run_test_checking_output(current_directory, test_dir, log_file)
+        except KeyboardInterrupt:
+            # Catch an interrupt from the user
+            write_screen_log("\nFAILED: Execution interrupted.  Killing test process, but not aborting full test run.")
+            # process_pid is still 0 if no child pid has been stored yet
+            # (presumably run_test_checking_output records it -- not visible
+            # here).  os.kill(0, ...) would signal our own process group, so
+            # only kill when a real pid is known.
+            if process_pid:
+                try:
+                    os.kill(process_pid, 9)
+                except OSError:
+                    # The child has already exited; nothing left to kill.
+                    pass
+            if sys.version_info[0] < 3:
+                answer = raw_input("Abort all tests? (y/n)")
+            else:
+                answer = input("Abort all tests? (y/n)")
+            if answer.find("y") != -1:
+                write_screen_log("\nUser chose to abort all tests.")
+                log_file.close()
+                sys.exit(-1)
+            else:
+                write_screen_log("\nUser chose to continue with other tests. Reporting this test as failed.")
+                result = 1
+        run_time = (time.time() - start_time)
-   # Move print the finish status
-   if (result == 0):
-    print("("+get_time()+")     PASSED " + test_name.ljust(40) +": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) +")"),
-   else:
-    print("("+get_time()+")     FAILED " + test_name.ljust(40) +": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) +")"),
+        # Print the finish status
+        if result == 0:
+            print("(" + get_time() + ")     PASSED " + test_name.ljust(40) + ": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) + ")", end='')
+        else:
+            print("(" + get_time() + ")     FAILED " + test_name.ljust(40) + ": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) + ")", end='')
-   test_number = test_number + 1
-   log_file.write("     ----------------------------------------------------------------------------------------\n")
-   log_file.flush()
+        test_number = test_number + 1
+        log_file.write("     ----------------------------------------------------------------------------------------\n")
+        log_file.flush()
-   print("")
-   if (result != 0):
-    log_file.write("  *******************************************************************************************\n")
-    log_file.write("  *  ("+get_time()+")     Test " + test_name + " ==> FAILED: " + str(result)+"\n")
-    log_file.write("  *******************************************************************************************\n")
-    failures = failures + 1
-   else:
-    log_file.write("     ("+get_time()+")     Test " + test_name +" passed in " + str(run_time) + "s\n")
+        print("")
+        # Any non-zero result counts as exactly one failure for this test
+        if result != 0:
+            log_file.write("  *******************************************************************************************\n")
+            log_file.write("  *  (" + get_time() + ")     Test " + test_name + " ==> FAILED: " + str(result) + "\n")
+            log_file.write("  *******************************************************************************************\n")
+            failures = failures + 1
+        else:
+            log_file.write("     (" + get_time() + ")     Test " + test_name + " passed in " + str(run_time) + "s\n")
-   log_file.write("     ----------------------------------------------------------------------------------------\n")
-   log_file.write("\n")
-  return failures
+        log_file.write("     ----------------------------------------------------------------------------------------\n")
+        log_file.write("\n")
+    return failures
 # ########################
 # Begin OpenCL conformance run script
 # ########################
-if (len(sys.argv) < 2):
- write_help_info()
- sys.exit(-1)
+if len(sys.argv) < 2:
+    write_help_info()
+    sys.exit(-1)
 current_directory = os.getcwd()
 # Open the log file
 for arg in sys.argv:
- match ="log=(\S+)", arg)
- if (match):
-  log_file_name ='/') + os.sep + log_file_name
+    match ="log=(\S+)", arg)
+    if match:
+        log_file_name ='/') + os.sep + log_file_name
- log_file = open(log_file_name, "w")
+    log_file = open(log_file_name, "w")
 except IOError:
- print "Could not open log file " + log_file_name
+    print("Could not open log file " + log_file_name)
+    sys.exit(-1)
 # Determine which devices to test
 devices_to_test = []
 for device in device_types:
- if device in sys.argv[2:]:
-  devices_to_test.append(device)
-if (len(devices_to_test) == 0):
- devices_to_test = ["CL_DEVICE_TYPE_DEFAULT"]
+    if device in sys.argv[2:]:
+        devices_to_test.append(device)
+if len(devices_to_test) == 0:
+    devices_to_test = ["CL_DEVICE_TYPE_DEFAULT"]
 write_screen_log("Testing on: " + str(devices_to_test))
 # Get the tests
@@ -306,52 +315,52 @@
 tests_to_use = []
 num_of_patterns_to_match = 0
 for arg in sys.argv[2:]:
- if arg in device_types:
-  continue
- if"log=(\S+)", arg):
-  continue
- num_of_patterns_to_match = num_of_patterns_to_match + 1
- found_it = False
- for test in tests:
-  (test_name, test_dir) = test
-  if (test_name.find(arg) != -1 or test_dir.find(arg) != -1):
-   found_it = True
-   if (test not in tests_to_use):
-    tests_to_use.append(test)
- if (found_it == False):
-  print("Failed to find a test matching " + arg)
-if (len(tests_to_use) == 0):
- if (num_of_patterns_to_match > 0):
-  print("FAILED: Failed to find any tests matching the given command-line options.")
-  print("")
-  write_help_info()
-  sys.exit(-1)
+    if arg in device_types:
+        continue
+    if"log=(\S+)", arg):
+        continue
+    num_of_patterns_to_match = num_of_patterns_to_match + 1
+    found_it = False
+    for test in tests:
+        (test_name, test_dir) = test
+        if (test_name.find(arg) != -1 or test_dir.find(arg) != -1):
+            found_it = True
+            if test not in tests_to_use:
+                tests_to_use.append(test)
+    if found_it == False:
+        print("Failed to find a test matching " + arg)
+if len(tests_to_use) == 0:
+    if num_of_patterns_to_match > 0:
+        print("FAILED: Failed to find any tests matching the given command-line options.")
+        print("")
+        write_help_info()
+        sys.exit(-1)
- tests = tests_to_use[:]
+    tests = tests_to_use[:]
 write_screen_log("Test execution arguments: " + str(sys.argv))
-write_screen_log("Logging to file " + log_file_name +".")
+write_screen_log("Logging to file " + log_file_name + ".")
 write_screen_log("Loaded tests from " + sys.argv[1] + ", total of " + str(len(tests)) + " tests selected to run:")
 for (test_name, test_command) in tests:
- write_screen_log(test_name.ljust(50) + " (" + test_command +")")
+    write_screen_log(test_name.ljust(50) + " (" + test_command + ")")
 # Run the tests
 total_failures = 0
 for device_to_test in devices_to_test:
- os.environ['CL_DEVICE_TYPE'] = device_to_test
- write_screen_log("========================================================================================")
- write_screen_log("========================================================================================")
- write_screen_log(("Setting CL_DEVICE_TYPE to " + device_to_test).center(90))
- write_screen_log("========================================================================================")
- write_screen_log("========================================================================================")
- failures = run_tests(tests)
- write_screen_log("========================================================================================")
- if (failures == 0):
-  write_screen_log(">> TEST on " + device_to_test + " PASSED")
- else:
-  write_screen_log(">> TEST on " + device_to_test + " FAILED (" + str(failures) + " FAILURES)")
- write_screen_log("========================================================================================")
- total_failures = total_failures + failures
+    os.environ['CL_DEVICE_TYPE'] = device_to_test
+    write_screen_log("========================================================================================")
+    write_screen_log("========================================================================================")
+    write_screen_log(("Setting CL_DEVICE_TYPE to " + device_to_test).center(90))
+    write_screen_log("========================================================================================")
+    write_screen_log("========================================================================================")
+    failures = run_tests(tests)
+    write_screen_log("========================================================================================")
+    if failures == 0:
+        write_screen_log(">> TEST on " + device_to_test + " PASSED")
+    else:
+        write_screen_log(">> TEST on " + device_to_test + " FAILED (" + str(failures) + " FAILURES)")
+    write_screen_log("========================================================================================")
+    total_failures = total_failures + failures
-write_screen_log("("+get_time()+") Testing complete.  " + str(total_failures) + " failures for " + str(len(tests)) + " tests.")
+write_screen_log("(" + get_time() + ") Testing complete.  " + str(total_failures) + " failures for " + str(len(tests)) + " tests.")
diff --git a/test_conformance/select/test_select.cpp b/test_conformance/select/test_select.cpp
index 35f154a..972a53c 100644
--- a/test_conformance/select/test_select.cpp
+++ b/test_conformance/select/test_select.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2017 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -79,7 +79,6 @@
 // sub tests which is for each individual test.  The following
 // tracks the subtests
 int s_test_cnt = 0;
-int s_test_fail = 0;
 // Static helper functions
@@ -174,8 +173,6 @@
     char extension[128] = "";
     int  err = 0;
-    int i; // generic, re-usable loop variable
     const char *source[] = {
         "__kernel void ", testname,
@@ -297,6 +294,7 @@
 static int doTest(cl_command_queue queue, cl_context context, Type stype, Type cmptype, cl_device_id device)
     int err = CL_SUCCESS;
+    int s_test_fail = 0;
     MTdata    d;
     const size_t element_count[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
     cl_mem src1 = NULL;
@@ -468,6 +466,11 @@
+    if (s_test_fail)
+    {
+        err = TEST_FAIL;
+        gFailCount++;
+    }
     return err;
@@ -636,7 +639,6 @@
         s_wimpy_mode = true;
-    log_info( "Test binary built %s %s\n", __DATE__, __TIME__ );
     if (s_wimpy_mode) {
         log_info("*** WARNING: Testing in Wimpy mode!                     ***\n");
@@ -665,4 +667,3 @@
         log_info( "\t%s\n", test_list[i].name );
diff --git a/test_conformance/spir/main.cpp b/test_conformance/spir/main.cpp
index 3a18988..06caf33 100644
--- a/test_conformance/spir/main.cpp
+++ b/test_conformance/spir/main.cpp
@@ -6615,40 +6615,45 @@
 static const sub_suite spir_suites[] = {
-    {"api",                         "api",                       test_api},
-    {"api_double",                  "api",                       test_api_double},
-    {"atomics",                     "atomics",                   test_atomics},
-    {"basic",                       "basic",                     test_basic},
-    {"basic_double",                "basic",                     test_basic_double},
-    {"commonfns",                   "commonfns",                 test_commonfns},
-    {"commonfns_double",            "commonfns",                 test_commonfns_double},
-    {"conversions",                 "conversions",               test_conversions},
-    {"conversions_double",          "conversions",               test_conversions_double},
-    {"geometrics",                  "geometrics",                test_geometrics},
-    {"geometrics_double",           "geometrics",                test_geometrics_double},
-    {"half",                        "half",                      test_half},
-    {"half_double",                 "half",                      test_half_double},
-    {"kernel_image_methods",        "kernel_image_methods",      test_kernel_image_methods},
-    {"images_kernel_read_write",    "images_kernel_read_write",  test_images_kernel_read_write},
-    {"images_samplerlessRead",      "images_samplerlessRead",    test_images_samplerless_read},
-    {"integer_ops",                 "integer_ops",               test_integer_ops},
-    {"math_brute_force",            "math_brute_force",          test_math_brute_force},
-    {"math_brute_force_double",     "math_brute_force",          test_math_brute_force_double},
-    {"printf",                      "printf",                    test_printf},
-    {"profiling",                   "profiling",                 test_profiling},
-    {"relationals",                 "relationals",               test_relationals},
-    {"relationals_double",          "relationals",               test_relationals_double},
-    {"select",                      "select",                    test_select},
-    {"select_double",               "select",                    test_select_double},
-    {"vec_align",                   "vec_align",                 test_vec_align},
-    {"vec_align_double",            "vec_align",                 test_vec_align_double},
-    {"vec_step",                    "vec_step",                  test_vec_step},
-    {"vec_step_double",             "vec_step",                  test_vec_step_double},
-    {"compile_and_link",            "compile_and_link",          test_compile_and_link},
-    {"sampler_enumeration",         "sampler_enumeration",       test_sampler_enumeration},
-    {"enum_values",                 "enum_values",               test_enum_values},
-    {"kernel_attributes",           "kernel_attributes",         test_kernel_attributes},
-    {"binary_type",                  "binary_type",              test_binary_type},
+    { "api", "api", test_api },
+    { "api_double", "api", test_api_double },
+    { "atomics", "atomics", test_atomics },
+    { "basic", "basic", test_basic },
+    { "basic_double", "basic", test_basic_double },
+    { "commonfns", "commonfns", test_commonfns },
+    { "commonfns_double", "commonfns", test_commonfns_double },
+    { "conversions", "conversions", test_conversions },
+    { "conversions_double", "conversions", test_conversions_double },
+    { "geometrics", "geometrics", test_geometrics },
+    { "geometrics_double", "geometrics", test_geometrics_double },
+    { "half", "half", test_half },
+    { "half_double", "half", test_half_double },
+    { "kernel_image_methods", "kernel_image_methods",
+      test_kernel_image_methods },
+    { "images_kernel_read_write", "images_kernel_read_write",
+      test_images_kernel_read_write },
+    { "images_samplerlessRead", "images_samplerlessRead",
+      test_images_samplerless_read },
+    { "integer_ops", "integer_ops", test_integer_ops },
+    { "math_brute_force", "math_brute_force", test_math_brute_force },
+    { "math_brute_force_double", "math_brute_force",
+      test_math_brute_force_double },
+    { "printf", "printf", test_printf },
+    { "profiling", "profiling", test_profiling },
+    { "relationals", "relationals", test_relationals },
+    { "relationals_double", "relationals", test_relationals_double },
+    { "select", "select", test_select },
+    { "select_double", "select", test_select_double },
+    { "vec_align", "vec_align", test_vec_align },
+    { "vec_align_double", "vec_align", test_vec_align_double },
+    { "vec_step", "vec_step", test_vec_step },
+    { "vec_step_double", "vec_step", test_vec_step_double },
+    { "compile_and_link", "compile_and_link", test_compile_and_link },
+    { "sampler_enumeration", "sampler_enumeration", test_sampler_enumeration },
+    { "enum_values", "enum_values", test_enum_values },
+    // {"kernel_attributes",           "kernel_attributes",
+    // test_kernel_attributes}, // disabling temporarily, see GitHub #1284
+    { "binary_type", "binary_type", test_binary_type },
diff --git a/test_conformance/spir/run_services.cpp b/test_conformance/spir/run_services.cpp
index 06fc418..6e06d53 100644
--- a/test_conformance/spir/run_services.cpp
+++ b/test_conformance/spir/run_services.cpp
@@ -213,7 +213,6 @@
     int error = CL_SUCCESS;
     cl_kernel kernel = NULL;
-    cl_device_id device = get_program_device(program);
     /* And create a kernel from it */
     kernel = clCreateKernel( program, kernel_name.c_str(), &error );
     if( kernel == NULL || error != CL_SUCCESS)
@@ -389,6 +388,7 @@
         ret = ret | OclExtensions::fromString(*it);
     return ret;
@@ -399,75 +399,80 @@
+// Maps an extension name to its OclExtensions flag.  The name is prefixed
+// with "OclExtensions::has_" and passed, together with each enumerator, to
+// RETURN_IF_ENUM (defined outside this hunk; presumably it returns the
+// enumerator when the string equals its stringified name -- confirm).  When
+// no enumerator returns, the result is OclExtensions::empty().
 OclExtensions OclExtensions::fromString(const std::string& e)
-    std::string s = "OclExtensions::" + e;
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_int64_base_atomics);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_int64_extended_atomics);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_3d_image_writes);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_fp16);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_sharing);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_event);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_d3d10_sharing);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_dx9_media_sharing);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_d3d11_sharing);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_depth_images);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_depth_images);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_msaa_sharing);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_image2d_from_buffer);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_initialize_memory);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_spir);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_fp64);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_global_int32_base_atomics);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_global_int32_extended_atomics);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_local_int32_base_atomics);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_local_int32_extended_atomics);
-    RETURN_IF_ENUM(s, OclExtensions::cl_khr_byte_addressable_store);
-    RETURN_IF_ENUM(s, OclExtensions::cles_khr_int64);
-    RETURN_IF_ENUM(s, OclExtensions::cles_khr_2d_image_array_writes);
+    std::string s = "OclExtensions::has_" + e;
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_int64_base_atomics);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_int64_extended_atomics);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_3d_image_writes);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_fp16);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_sharing);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_event);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_d3d10_sharing);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_dx9_media_sharing);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_d3d11_sharing);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_depth_images);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_depth_images);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_msaa_sharing);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_image2d_from_buffer);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_initialize_memory);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_spir);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_fp64);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_global_int32_base_atomics);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_global_int32_extended_atomics);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_local_int32_base_atomics);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_local_int32_extended_atomics);
+    RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_byte_addressable_store);
+    RETURN_IF_ENUM(s, OclExtensions::has_cles_khr_int64);
+    RETURN_IF_ENUM(s, OclExtensions::has_cles_khr_2d_image_array_writes);
     // Unknown KHR string.
     return OclExtensions::empty();
 std::string OclExtensions::toString()
-    #define APPEND_STR_IF_SUPPORTS( STR, E)                          \
-        if ( this->supports(E) )                                     \
-        {                                                            \
-            std::string ext_str( #E );                               \
-            std::string prefix = "OclExtensions::";                  \
-            size_t pos = ext_str.find( prefix );                     \
-            if (  pos != std::string::npos )                         \
-            {                                                        \
-                ext_str.replace( pos, prefix.length(), "");          \
-            }                                                        \
-            STR += ext_str;                                          \
-        }
+#define APPEND_STR_IF_SUPPORTS(STR, E)                                         \
+    if (this->supports(E))                                                     \
+    {                                                                          \
+        std::string ext_str(#E);                                               \
+        std::string prefix = "OclExtensions::has_";                            \
+        size_t pos = ext_str.find(prefix);                                     \
+        if (pos != std::string::npos)                                          \
+        {                                                                      \
+            ext_str.replace(pos, prefix.length(), "");                         \
+        }                                                                      \
+        STR += ext_str;                                                        \
+        STR += " ";                                                            \
+    }
     std::string s = "";
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_int64_base_atomics );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_int64_extended_atomics );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_3d_image_writes );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_fp16 );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_sharing );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_event );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_d3d10_sharing );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_dx9_media_sharing );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_d3d11_sharing );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_depth_images );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_depth_images );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_msaa_sharing );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_image2d_from_buffer );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_initialize_memory );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_spir );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_fp64 );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_global_int32_base_atomics );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_global_int32_extended_atomics );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_local_int32_base_atomics );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_local_int32_extended_atomics );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_byte_addressable_store );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cles_khr_int64 );
-    APPEND_STR_IF_SUPPORTS( s, OclExtensions::cles_khr_2d_image_array_writes );
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_int64_base_atomics);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_int64_extended_atomics);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_3d_image_writes);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_fp16);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_sharing);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_event);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_d3d10_sharing);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_dx9_media_sharing);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_d3d11_sharing);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_depth_images);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_depth_images);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_msaa_sharing);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_image2d_from_buffer);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_initialize_memory);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_spir);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_fp64);
+                           OclExtensions::has_cl_khr_global_int32_base_atomics);
+        s, OclExtensions::has_cl_khr_global_int32_extended_atomics);
+                           OclExtensions::has_cl_khr_local_int32_base_atomics);
+        s, OclExtensions::has_cl_khr_local_int32_extended_atomics);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_byte_addressable_store);
+    APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cles_khr_int64);
+                           OclExtensions::has_cles_khr_2d_image_array_writes);
     return s;
diff --git a/test_conformance/spir/run_services.h b/test_conformance/spir/run_services.h
index 6bac4c9..10f0d05 100644
--- a/test_conformance/spir/run_services.h
+++ b/test_conformance/spir/run_services.h
@@ -113,42 +113,33 @@
     OclExtensions(size_t ext) : m_extVector(ext) {}
-// Fix a compilation error, since cl_khr_gl_sharing is defined as a macro.
-#ifdef cl_khr_gl_sharing
-#undef cl_khr_gl_sharing
-#ifdef cl_khr_icd
-#undef cl_khr_icd
     enum ClKhrs
-        no_extensions                 = KhrValue<0>::Mask,
-        cl_khr_int64_base_atomics     = KhrValue<1>::Mask,
-        cl_khr_int64_extended_atomics = KhrValue<2>::Mask,
-        cl_khr_3d_image_writes        = KhrValue<3>::Mask,
-        cl_khr_fp16                   = KhrValue<4>::Mask,
-        cl_khr_gl_sharing             = KhrValue<5>::Mask,
-        cl_khr_gl_event               = KhrValue<6>::Mask,
-        cl_khr_d3d10_sharing          = KhrValue<7>::Mask,
-        cl_khr_dx9_media_sharing      = KhrValue<8>::Mask,
-        cl_khr_d3d11_sharing          = KhrValue<9>::Mask,
-        cl_khr_depth_images           = KhrValue<10>::Mask,
-        cl_khr_gl_depth_images        = KhrValue<11>::Mask,
-        cl_khr_gl_msaa_sharing        = KhrValue<12>::Mask,
-        cl_khr_image2d_from_buffer    = KhrValue<13>::Mask,
-        cl_khr_initialize_memory      = KhrValue<14>::Mask,
-        cl_khr_context_abort          = KhrValue<15>::Mask,
-        cl_khr_spir                   = KhrValue<16>::Mask,
-        cl_khr_fp64                   = KhrValue<17>::Mask,
-        cl_khr_global_int32_base_atomics     = KhrValue<18>::Mask,
-        cl_khr_global_int32_extended_atomics = KhrValue<19>::Mask,
-        cl_khr_local_int32_base_atomics      = KhrValue<20>::Mask,
-        cl_khr_local_int32_extended_atomics  = KhrValue<21>::Mask,
-        cl_khr_byte_addressable_store        = KhrValue<22>::Mask,
-        cles_khr_int64                         = KhrValue<23>::Mask,
-        cles_khr_2d_image_array_writes         = KhrValue<24>::Mask,
+        no_extensions = KhrValue<0>::Mask,
+        has_cl_khr_int64_base_atomics = KhrValue<1>::Mask,
+        has_cl_khr_int64_extended_atomics = KhrValue<2>::Mask,
+        has_cl_khr_3d_image_writes = KhrValue<3>::Mask,
+        has_cl_khr_fp16 = KhrValue<4>::Mask,
+        has_cl_khr_gl_sharing = KhrValue<5>::Mask,
+        has_cl_khr_gl_event = KhrValue<6>::Mask,
+        has_cl_khr_d3d10_sharing = KhrValue<7>::Mask,
+        has_cl_khr_dx9_media_sharing = KhrValue<8>::Mask,
+        has_cl_khr_d3d11_sharing = KhrValue<9>::Mask,
+        has_cl_khr_depth_images = KhrValue<10>::Mask,
+        has_cl_khr_gl_depth_images = KhrValue<11>::Mask,
+        has_cl_khr_gl_msaa_sharing = KhrValue<12>::Mask,
+        has_cl_khr_image2d_from_buffer = KhrValue<13>::Mask,
+        has_cl_khr_initialize_memory = KhrValue<14>::Mask,
+        has_cl_khr_context_abort = KhrValue<15>::Mask,
+        has_cl_khr_spir = KhrValue<16>::Mask,
+        has_cl_khr_fp64 = KhrValue<17>::Mask,
+        has_cl_khr_global_int32_base_atomics = KhrValue<18>::Mask,
+        has_cl_khr_global_int32_extended_atomics = KhrValue<19>::Mask,
+        has_cl_khr_local_int32_base_atomics = KhrValue<20>::Mask,
+        has_cl_khr_local_int32_extended_atomics = KhrValue<21>::Mask,
+        has_cl_khr_byte_addressable_store = KhrValue<22>::Mask,
+        has_cles_khr_int64 = KhrValue<23>::Mask,
+        has_cles_khr_2d_image_array_writes = KhrValue<24>::Mask,
     size_t m_extVector;
diff --git a/test_conformance/spir/ b/test_conformance/spir/
index 5f8a7a0..ab9c9a5 100644
--- a/test_conformance/spir/
+++ b/test_conformance/spir/
Binary files differ
diff --git a/test_conformance/spirv_new/main.cpp b/test_conformance/spirv_new/main.cpp
index 5a8664b..4156683 100644
--- a/test_conformance/spirv_new/main.cpp
+++ b/test_conformance/spirv_new/main.cpp
@@ -203,7 +203,6 @@
 test_status InitCL(cl_device_id id)
     test_status spirv_status;
-    bool force = true;
     spirv_status = check_spirv_compilation_readiness(id);
     if (spirv_status != TEST_PASS)
diff --git a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
index 9e1789c..0728ea0 100644
--- a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
+++ b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
@@ -1,219 +1,218 @@

-Copyright (c) 2018 The Khronos Group Inc. All Rights Reserved.


-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.

-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to

-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,

-broadcast or otherwise exploited in any manner without the express prior written permission

-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,

-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,

-in whole or in part other than under the terms of the Khronos Adopters Agreement

-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.



-#include "testBase.h"

-#include "types.hpp"


-#include <sstream>

-#include <string>

-#include <type_traits>



-template<typename T>

-int test_ext_cl_khr_spirv_no_integer_wrap_decoration(cl_device_id deviceID,

-               cl_context context,

-               cl_command_queue queue,

-               const char *spvName,

-               const char *funcName,

-               const char *Tname)



-    cl_int err = CL_SUCCESS;

-    const int num = 10;

-    std::vector<T> h_lhs(num);

-    std::vector<T> h_rhs(num);

-    std::vector<T> expected_results(num);

-    std::vector<T> h_ref(num);

-    if (!is_extension_available(deviceID, "cl_khr_spirv_no_integer_wrap_decoration")) {

-        log_info("Extension cl_khr_spirv_no_integer_wrap_decoration not supported; skipping tests.\n");

-        return 0;

-    }


-    /*Test with some values that do not cause overflow*/

-    if (std::is_signed<T>::value == true) {

-        h_lhs.push_back((T)-25000);

-        h_lhs.push_back((T)-3333);

-        h_lhs.push_back((T)-7);

-        h_lhs.push_back((T)-1);

-        h_lhs.push_back(0);

-        h_lhs.push_back(1);

-        h_lhs.push_back(1024);

-        h_lhs.push_back(2048);

-        h_lhs.push_back(4094);

-        h_lhs.push_back(10000);

-    } else {

-        h_lhs.push_back(0);

-        h_lhs.push_back(1);

-        h_lhs.push_back(3);

-        h_lhs.push_back(5);

-        h_lhs.push_back(10);

-        h_lhs.push_back(100);

-        h_lhs.push_back(1024);

-        h_lhs.push_back(2048);

-        h_lhs.push_back(4094);

-        h_lhs.push_back(52888);

-    }


-    h_rhs.push_back(0);

-    h_rhs.push_back(1);

-    h_rhs.push_back(2);

-    h_rhs.push_back(3);

-    h_rhs.push_back(4);

-    h_rhs.push_back(5);

-    h_rhs.push_back(6);

-    h_rhs.push_back(7);

-    h_rhs.push_back(8);

-    h_rhs.push_back(9);

-    size_t bytes = num * sizeof(T);


-    clMemWrapper lhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);

-    SPIRV_CHECK_ERROR(err, "Failed to create lhs buffer");


-    err = clEnqueueWriteBuffer(queue, lhs, CL_TRUE, 0, bytes, &h_lhs[0], 0, NULL, NULL);

-    SPIRV_CHECK_ERROR(err, "Failed to copy to lhs buffer");


-    clMemWrapper rhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);

-    SPIRV_CHECK_ERROR(err, "Failed to create rhs buffer");


-    err = clEnqueueWriteBuffer(queue, rhs, CL_TRUE, 0, bytes, &h_rhs[0], 0, NULL, NULL);

-    SPIRV_CHECK_ERROR(err, "Failed to copy to rhs buffer");


-    std::string kernelStr;


-    {

-        std::stringstream kernelStream;

-        kernelStream << "#define spirv_fadd(a, b) (a) + (b)               \n";

-        kernelStream << "#define spirv_fsub(a, b) (a) - (b)               \n";

-        kernelStream << "#define spirv_fmul(a, b) (a) * (b)               \n";

-        kernelStream << "#define spirv_fshiftleft(a, b) (a) << (b)        \n";

-        kernelStream << "#define spirv_fnegate(a, b)  (-a)                \n";


-        kernelStream << "#define T " << Tname                         << "\n";

-        kernelStream << "#define FUNC spirv_" << funcName             << "\n";

-        kernelStream << "__kernel void fmath_cl(__global T *out,          \n";

-        kernelStream << "const __global T *lhs, const __global T *rhs)    \n";

-        kernelStream << "{                                                \n";

-        kernelStream << "    int id = get_global_id(0);                   \n";

-        kernelStream << "    out[id] = FUNC(lhs[id], rhs[id]);            \n";

-        kernelStream << "}                                                \n";

-        kernelStr = kernelStream.str();

-    }


-    size_t kernelLen = kernelStr.size();

-    const char *kernelBuf = kernelStr.c_str();


-    for (int i = 0; i < num; i++) {

-        if (std::string(funcName) == std::string("fadd")) {

-            expected_results[i] = h_lhs[i] + h_rhs[i];

-        } else if (std::string(funcName) == std::string("fsub")) {

-            expected_results[i] = h_lhs[i] - h_rhs[i];

-        } else if (std::string(funcName) == std::string("fmul")) {

-            expected_results[i] = h_lhs[i] * h_rhs[i];

-        } else if (std::string(funcName) == std::string("fshiftleft")) {

-            expected_results[i] = h_lhs[i] << h_rhs[i];

-        } else if (std::string(funcName) == std::string("fnegate")) {

-            expected_results[i] = 0 - h_lhs[i];

-        }

-    }


-    {

-        // Run the cl kernel for reference results

-        clProgramWrapper prog;

-        clKernelWrapper kernel;

-        err = create_single_kernel_helper(context, &prog, &kernel, 1,

-                                          &kernelBuf, "fmath_cl");

-        SPIRV_CHECK_ERROR(err, "Failed to create cl kernel");


-        clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);

-        SPIRV_CHECK_ERROR(err, "Failed to create ref buffer");


-        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &ref);

-        SPIRV_CHECK_ERROR(err, "Failed to set arg 0");


-        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs);

-        SPIRV_CHECK_ERROR(err, "Failed to set arg 1");


-        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs);

-        SPIRV_CHECK_ERROR(err, "Failed to set arg 2");


-        size_t global = num;

-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);

-        SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");


-        err = clEnqueueReadBuffer(queue, ref, CL_TRUE, 0, bytes, &h_ref[0], 0, NULL, NULL);

-        SPIRV_CHECK_ERROR(err, "Failed to read from ref");

-    }


-    for (int i = 0; i < num; i++) {

-        if (expected_results[i] != h_ref[i]) {

-            log_error("Values do not match at index %d expected = %d got = %d\n", i, expected_results[i], h_ref[i]);

-            return -1;

-        }

-    }


-    clProgramWrapper prog;

-    err = get_program_with_il(prog, deviceID, context, spvName);

-    SPIRV_CHECK_ERROR(err, "Failed to build program");


-    clKernelWrapper kernel = clCreateKernel(prog, "fmath_cl", &err);

-    SPIRV_CHECK_ERROR(err, "Failed to create spv kernel");


-    clMemWrapper res = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);

-    SPIRV_CHECK_ERROR(err, "Failed to create res buffer");


-    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &res);

-    SPIRV_CHECK_ERROR(err, "Failed to set arg 0");


-    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs);

-    SPIRV_CHECK_ERROR(err, "Failed to set arg 1");


-    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs);

-    SPIRV_CHECK_ERROR(err, "Failed to set arg 2");


-    size_t global = num;

-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);

-    SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");


-    std::vector<T> h_res(num);

-    err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, bytes, &h_res[0], 0, NULL, NULL);

-    SPIRV_CHECK_ERROR(err, "Failed to read from ref");


-    for (int i = 0; i < num; i++) {

-        if (expected_results[i] != h_res[i]) {

-            log_error("Values do not match at location %d expected = %d got = %d\n", i, expected_results[i], h_res[i]);

-            return -1;

-        }

-    }


-    return 0;



-#define TEST_FMATH_FUNC(TYPE, FUNC)                                                              \

-    TEST_SPIRV_FUNC(ext_cl_khr_spirv_no_integer_wrap_decoration_##FUNC##_##TYPE)                 \

-    {                                                                                            \

-        return test_ext_cl_khr_spirv_no_integer_wrap_decoration<cl_##TYPE>(deviceID, context, queue, \

-                          "ext_cl_khr_spirv_no_integer_wrap_decoration_"#FUNC"_"#TYPE,           \

-                          #FUNC,                                                                 \

-                          #TYPE                                                                  \

-                          );                                                                     \

-    }


-TEST_FMATH_FUNC(int, fadd)

-TEST_FMATH_FUNC(int, fsub)

-TEST_FMATH_FUNC(int, fmul)

-TEST_FMATH_FUNC(int, fshiftleft)

-TEST_FMATH_FUNC(int, fnegate)

-TEST_FMATH_FUNC(uint, fadd)

-TEST_FMATH_FUNC(uint, fsub)

-TEST_FMATH_FUNC(uint, fmul)

-TEST_FMATH_FUNC(uint, fshiftleft)
\ No newline at end of file
+Copyright (c) 2018 The Khronos Group Inc. All Rights Reserved.
+This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
+This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
+third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
+broadcast or otherwise exploited in any manner without the express prior written permission
+of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
+disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
+in whole or in part other than under the terms of the Khronos Adopters Agreement
+or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
+#include "testBase.h"
+#include "types.hpp"
+#include <sstream>
+#include <string>
+#include <type_traits>
+template<typename T>
+int test_ext_cl_khr_spirv_no_integer_wrap_decoration(cl_device_id deviceID,
+               cl_context context,
+               cl_command_queue queue,
+               const char *spvName,
+               const char *funcName,
+               const char *Tname)
+    cl_int err = CL_SUCCESS;
+    const int num = 10;
+    std::vector<T> h_lhs(num);
+    std::vector<T> h_rhs(num);
+    std::vector<T> expected_results(num);
+    std::vector<T> h_ref(num);
+    if (!is_extension_available(deviceID, "cl_khr_spirv_no_integer_wrap_decoration")) {
+        log_info("Extension cl_khr_spirv_no_integer_wrap_decoration not supported; skipping tests.\n");
+        return 0;
+    }
+    /*Test with some values that do not cause overflow*/
+    if (std::is_signed<T>::value == true) {
+        h_lhs.push_back((T)-25000);
+        h_lhs.push_back((T)-3333);
+        h_lhs.push_back((T)-7);
+        h_lhs.push_back((T)-1);
+        h_lhs.push_back(0);
+        h_lhs.push_back(1);
+        h_lhs.push_back(1024);
+        h_lhs.push_back(2048);
+        h_lhs.push_back(4094);
+        h_lhs.push_back(10000);
+    } else {
+        h_lhs.push_back(0);
+        h_lhs.push_back(1);
+        h_lhs.push_back(3);
+        h_lhs.push_back(5);
+        h_lhs.push_back(10);
+        h_lhs.push_back(100);
+        h_lhs.push_back(1024);
+        h_lhs.push_back(2048);
+        h_lhs.push_back(4094);
+        h_lhs.push_back(52888);
+    }
+    h_rhs.push_back(0);
+    h_rhs.push_back(1);
+    h_rhs.push_back(2);
+    h_rhs.push_back(3);
+    h_rhs.push_back(4);
+    h_rhs.push_back(5);
+    h_rhs.push_back(6);
+    h_rhs.push_back(7);
+    h_rhs.push_back(8);
+    h_rhs.push_back(9);
+    size_t bytes = num * sizeof(T);
+    clMemWrapper lhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);
+    SPIRV_CHECK_ERROR(err, "Failed to create lhs buffer");
+    err = clEnqueueWriteBuffer(queue, lhs, CL_TRUE, 0, bytes, &h_lhs[0], 0, NULL, NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to copy to lhs buffer");
+    clMemWrapper rhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);
+    SPIRV_CHECK_ERROR(err, "Failed to create rhs buffer");
+    err = clEnqueueWriteBuffer(queue, rhs, CL_TRUE, 0, bytes, &h_rhs[0], 0, NULL, NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to copy to rhs buffer");
+    std::string kernelStr;
+    {
+        std::stringstream kernelStream;
+        kernelStream << "#define spirv_fadd(a, b) (a) + (b)               \n";
+        kernelStream << "#define spirv_fsub(a, b) (a) - (b)               \n";
+        kernelStream << "#define spirv_fmul(a, b) (a) * (b)               \n";
+        kernelStream << "#define spirv_fshiftleft(a, b) (a) << (b)        \n";
+        kernelStream << "#define spirv_fnegate(a, b)  (-a)                \n";
+        kernelStream << "#define T " << Tname                         << "\n";
+        kernelStream << "#define FUNC spirv_" << funcName             << "\n";
+        kernelStream << "__kernel void fmath_cl(__global T *out,          \n";
+        kernelStream << "const __global T *lhs, const __global T *rhs)    \n";
+        kernelStream << "{                                                \n";
+        kernelStream << "    int id = get_global_id(0);                   \n";
+        kernelStream << "    out[id] = FUNC(lhs[id], rhs[id]);            \n";
+        kernelStream << "}                                                \n";
+        kernelStr = kernelStream.str();
+    }
+    const char *kernelBuf = kernelStr.c_str();
+    for (int i = 0; i < num; i++) {
+        if (std::string(funcName) == std::string("fadd")) {
+            expected_results[i] = h_lhs[i] + h_rhs[i];
+        } else if (std::string(funcName) == std::string("fsub")) {
+            expected_results[i] = h_lhs[i] - h_rhs[i];
+        } else if (std::string(funcName) == std::string("fmul")) {
+            expected_results[i] = h_lhs[i] * h_rhs[i];
+        } else if (std::string(funcName) == std::string("fshiftleft")) {
+            expected_results[i] = h_lhs[i] << h_rhs[i];
+        } else if (std::string(funcName) == std::string("fnegate")) {
+            expected_results[i] = 0 - h_lhs[i];
+        }
+    }
+    {
+        // Run the cl kernel for reference results
+        clProgramWrapper prog;
+        clKernelWrapper kernel;
+        err = create_single_kernel_helper(context, &prog, &kernel, 1,
+                                          &kernelBuf, "fmath_cl");
+        SPIRV_CHECK_ERROR(err, "Failed to create cl kernel");
+        clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
+        SPIRV_CHECK_ERROR(err, "Failed to create ref buffer");
+        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &ref);
+        SPIRV_CHECK_ERROR(err, "Failed to set arg 0");
+        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs);
+        SPIRV_CHECK_ERROR(err, "Failed to set arg 1");
+        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs);
+        SPIRV_CHECK_ERROR(err, "Failed to set arg 2");
+        size_t global = num;
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+        SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");
+        err = clEnqueueReadBuffer(queue, ref, CL_TRUE, 0, bytes, &h_ref[0], 0, NULL, NULL);
+        SPIRV_CHECK_ERROR(err, "Failed to read from ref");
+    }
+    for (int i = 0; i < num; i++) {
+        if (expected_results[i] != h_ref[i]) {
+            log_error("Values do not match at index %d expected = %d got = %d\n", i, expected_results[i], h_ref[i]);
+            return -1;
+        }
+    }
+    clProgramWrapper prog;
+    err = get_program_with_il(prog, deviceID, context, spvName);
+    SPIRV_CHECK_ERROR(err, "Failed to build program");
+    clKernelWrapper kernel = clCreateKernel(prog, "fmath_cl", &err);
+    SPIRV_CHECK_ERROR(err, "Failed to create spv kernel");
+    clMemWrapper res = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
+    SPIRV_CHECK_ERROR(err, "Failed to create res buffer");
+    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &res);
+    SPIRV_CHECK_ERROR(err, "Failed to set arg 0");
+    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs);
+    SPIRV_CHECK_ERROR(err, "Failed to set arg 1");
+    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs);
+    SPIRV_CHECK_ERROR(err, "Failed to set arg 2");
+    size_t global = num;
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");
+    std::vector<T> h_res(num);
+    err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, bytes, &h_res[0], 0, NULL, NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to read from ref");
+    for (int i = 0; i < num; i++) {
+        if (expected_results[i] != h_res[i]) {
+            log_error("Values do not match at location %d expected = %d got = %d\n", i, expected_results[i], h_res[i]);
+            return -1;
+        }
+    }
+    return 0;
+#define TEST_FMATH_FUNC(TYPE, FUNC)                                                              \
+    TEST_SPIRV_FUNC(ext_cl_khr_spirv_no_integer_wrap_decoration_##FUNC##_##TYPE)                 \
+    {                                                                                            \
+        return test_ext_cl_khr_spirv_no_integer_wrap_decoration<cl_##TYPE>(deviceID, context, queue, \
+                          "ext_cl_khr_spirv_no_integer_wrap_decoration_"#FUNC"_"#TYPE,           \
+                          #FUNC,                                                                 \
+                          #TYPE                                                                  \
+                          );                                                                     \
+    }
+TEST_FMATH_FUNC(int, fadd)
+TEST_FMATH_FUNC(int, fsub)
+TEST_FMATH_FUNC(int, fmul)
+TEST_FMATH_FUNC(int, fshiftleft)
+TEST_FMATH_FUNC(int, fnegate)
+TEST_FMATH_FUNC(uint, fadd)
+TEST_FMATH_FUNC(uint, fsub)
+TEST_FMATH_FUNC(uint, fmul)
+TEST_FMATH_FUNC(uint, fshiftleft)
diff --git a/test_conformance/spirv_new/test_op_fmath.cpp b/test_conformance/spirv_new/test_op_fmath.cpp
index bec0667..61e2864 100644
--- a/test_conformance/spirv_new/test_op_fmath.cpp
+++ b/test_conformance/spirv_new/test_op_fmath.cpp
@@ -79,11 +79,8 @@
         kernelStr = kernelStream.str();
-    size_t kernelLen = kernelStr.size();
     const char *kernelBuf = kernelStr.c_str();
-    const char *options = fast_math ? "-cl-fast-relaxed-math" : NULL;
     std::vector<T> h_ref(num);
diff --git a/test_conformance/spirv_new/test_op_function.cpp b/test_conformance/spirv_new/test_op_function.cpp
index caa3e0d..16183e8 100644
--- a/test_conformance/spirv_new/test_op_function.cpp
+++ b/test_conformance/spirv_new/test_op_function.cpp
@@ -33,7 +33,6 @@
     err = clEnqueueWriteBuffer(queue, in, CL_TRUE, 0, bytes, &h_in[0], 0, NULL, NULL);
     SPIRV_CHECK_ERROR(err, "Failed to copy to in buffer");
-    cl_uint bits = sizeof(void *) * 8;
     std::string spvStr = std::string("op_function") + "_" + std::string(funcType);
     const char *spvName = spvStr.c_str();
diff --git a/test_conformance/spirv_new/test_op_negate.cpp b/test_conformance/spirv_new/test_op_negate.cpp
index 1891c9b..e3dc1f3 100644
--- a/test_conformance/spirv_new/test_op_negate.cpp
+++ b/test_conformance/spirv_new/test_op_negate.cpp
@@ -43,7 +43,6 @@
     err = clEnqueueWriteBuffer(queue, in, CL_TRUE, 0, bytes, &h_in[0], 0, NULL, NULL);
     SPIRV_CHECK_ERROR(err, "Failed to copy to in buffer");
-    cl_uint bits = sizeof(void *) * 8;
     std::string spvStr = std::string(funcName) + "_" + std::string(Tname);
     const char *spvName = spvStr.c_str();
diff --git a/test_conformance/spirv_new/test_op_opaque.cpp b/test_conformance/spirv_new/test_op_opaque.cpp
index 067d9e4..e621606 100644
--- a/test_conformance/spirv_new/test_op_opaque.cpp
+++ b/test_conformance/spirv_new/test_op_opaque.cpp
@@ -17,7 +17,6 @@
     const char *name = "opaque";
-    int num = (int)(1 << 10);
     cl_int err = CL_SUCCESS;
     std::vector<unsigned char> buffer_vec = readSPIRV(name);
diff --git a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
index 0a604bc..0859668 100644
--- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
+++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
@@ -75,7 +75,6 @@
         kernelStr = kernelStream.str();
-    size_t kernelLen = kernelStr.size();
     const char *kernelBuf = kernelStr.c_str();
     std::vector<Tv> h_ref(num);
@@ -107,7 +106,6 @@
         SPIRV_CHECK_ERROR(err, "Failed to read from ref");
-    cl_uint bits = sizeof(void *) * 8;
     std::string ref = "vector_times_scalar_";
     ref += Tname;
     const char *spvName = ref.c_str();
diff --git a/test_conformance/subgroups/CMakeLists.txt b/test_conformance/subgroups/CMakeLists.txt
index d48af9c..1ff249c 100644
--- a/test_conformance/subgroups/CMakeLists.txt
+++ b/test_conformance/subgroups/CMakeLists.txt
@@ -15,6 +15,7 @@
+    test_subgroup_rotate.cpp
diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp
index 44416dd..a3ae910 100644
--- a/test_conformance/subgroups/main.cpp
+++ b/test_conformance/subgroups/main.cpp
@@ -19,8 +19,10 @@
 #include <string.h>
 #include "procs.h"
 #include "harness/testHarness.h"
+#include "CL/cl_half.h"
 MTdata gMTdata;
+cl_half_rounding_mode g_rounding_mode;
 test_definition test_list[] = {
     ADD_TEST_VERSION(sub_group_info_ext, Version(2, 0)),
@@ -39,7 +41,8 @@
-    ADD_TEST(subgroup_functions_shuffle_relative)
+    ADD_TEST(subgroup_functions_shuffle_relative),
+    ADD_TEST(subgroup_functions_rotate),
 const int test_num = ARRAY_SIZE(test_list);
@@ -66,6 +69,22 @@
             ret = TEST_SKIP;
+    // Determine the rounding mode to be used in float to half conversions in
+    // init and reference code
+    const cl_device_fp_config fpConfig = get_default_rounding_mode(device);
+    if (fpConfig == CL_FP_ROUND_TO_NEAREST)
+    {
+        g_rounding_mode = CL_HALF_RTE;
+    }
+    else if (fpConfig == CL_FP_ROUND_TO_ZERO && gIsEmbedded)
+    {
+        g_rounding_mode = CL_HALF_RTZ;
+    }
+    else
+    {
+        assert(false && "Unreachable");
+    }
     return ret;
diff --git a/test_conformance/subgroups/procs.h b/test_conformance/subgroups/procs.h
index d09e824..d4f51be 100644
--- a/test_conformance/subgroups/procs.h
+++ b/test_conformance/subgroups/procs.h
@@ -81,4 +81,8 @@
                                                     cl_context context,
                                                     cl_command_queue queue,
                                                     int num_elements);
+extern int test_subgroup_functions_rotate(cl_device_id device,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements);
 #endif /*_procs_h*/
diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp
index f8b2445..33a5163 100644
--- a/test_conformance/subgroups/subgroup_common_kernels.cpp
+++ b/test_conformance/subgroups/subgroup_common_kernels.cpp
@@ -15,92 +15,20 @@
 #include "subgroup_common_kernels.h"
-const char* bcast_source =
-    "__kernel void test_bcast(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint which_sub_group_local_id = xy[gid].z;\n"
-    "    out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
-    "}\n";
+std::string sub_group_reduction_scan_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        out[gid] = %s(in[gid]);
+    }
-const char* redadd_source = "__kernel void test_redadd(const __global Type "
-                            "*in, __global int4 *xy, __global Type *out)\n"
-                            "{\n"
-                            "    int gid = get_global_id(0);\n"
-                            "    XY(xy,gid);\n"
-                            "    out[gid] = sub_group_reduce_add(in[gid]);\n"
-                            "}\n";
-const char* redmax_source = "__kernel void test_redmax(const __global Type "
-                            "*in, __global int4 *xy, __global Type *out)\n"
-                            "{\n"
-                            "    int gid = get_global_id(0);\n"
-                            "    XY(xy,gid);\n"
-                            "    out[gid] = sub_group_reduce_max(in[gid]);\n"
-                            "}\n";
-const char* redmin_source = "__kernel void test_redmin(const __global Type "
-                            "*in, __global int4 *xy, __global Type *out)\n"
-                            "{\n"
-                            "    int gid = get_global_id(0);\n"
-                            "    XY(xy,gid);\n"
-                            "    out[gid] = sub_group_reduce_min(in[gid]);\n"
-                            "}\n";
-const char* scinadd_source =
-    "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
-    "}\n";
-const char* scinmax_source =
-    "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
-    "}\n";
-const char* scinmin_source =
-    "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
-    "}\n";
-const char* scexadd_source =
-    "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
-    "}\n";
-const char* scexmax_source =
-    "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
-    "}\n";
-const char* scexmin_source =
-    "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
-    "}\n";
+std::string sub_group_generic_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        Type x = in[gid];
+        out[gid] = %s(x, xy[gid].z);
+    }
\ No newline at end of file
diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h
index 8ae97d9..bf2210e 100644
--- a/test_conformance/subgroups/subgroup_common_kernels.h
+++ b/test_conformance/subgroups/subgroup_common_kernels.h
@@ -18,15 +18,7 @@
 #include "subhelpers.h"
-extern const char* bcast_source;
-extern const char* redadd_source;
-extern const char* redmax_source;
-extern const char* redmin_source;
-extern const char* scinadd_source;
-extern const char* scinmax_source;
-extern const char* scinmin_source;
-extern const char* scexadd_source;
-extern const char* scexmax_source;
-extern const char* scexmin_source;
+extern std::string sub_group_reduction_scan_source;
+extern std::string sub_group_generic_source;
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index b30c416..b2648c3 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -17,13 +17,12 @@
 #include "typeWrappers.h"
-#include <bitset>
 #include "CL/cl_half.h"
 #include "subhelpers.h"
 #include <set>
+#include <algorithm>
+#include <random>
-typedef std::bitset<128> bs128;
 static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
                                   const std::string &mask_type,
                                   cl_uint max_sub_group_size)
@@ -66,6 +65,13 @@
 // only 4 work_items from subgroup enter the code (are active)
 template <typename Ty, SubgroupsBroadcastOp operation> struct BC
+    // Logs a single line identifying the broadcast operation under test and
+    // the element type, followed by the caller-supplied extra_text.
+    // test_params is not read by this implementation.
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
         int i, ii, j, k, n;
@@ -79,11 +85,8 @@
         int last_subgroup_size = 0;
         ii = 0;
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         if (non_uniform_size)
-            log_info("  non uniform work group size mode ON\n");
         for (k = 0; k < ng; ++k)
@@ -172,8 +175,8 @@
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int ii, i, j, k, l, n;
         int ng = test_params.global_workgroup_size;
@@ -290,8 +293,6 @@
             y += nw;
             m += 4 * nw;
-        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         return TEST_PASS;
@@ -301,7 +302,7 @@
 static subgroups::cl_half to_half(float x)
     subgroups::cl_half value;
- = cl_half_from_float(x, CL_HALF_RTE);
+ = cl_half_from_float(x, g_rounding_mode);
     return value;
@@ -320,7 +321,7 @@
         case ArithmeticOp::logical_and: return a && b;
         case ArithmeticOp::logical_or: return a || b;
         case ArithmeticOp::logical_xor: return !a ^ !b;
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return 0;
@@ -342,7 +343,7 @@
         case ArithmeticOp::mul_: {
             return a * b;
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return 0;
@@ -364,7 +365,7 @@
         case ArithmeticOp::mul_: {
             return a * b;
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return 0;
@@ -381,7 +382,7 @@
         case ArithmeticOp::min_:
             return to_float(a) < to_float(b) || is_half_nan( ? a : b;
         case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b));
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return to_half(0);
@@ -392,11 +393,44 @@
         || std::is_same<Ty, subgroups::cl_half>::value;
+// Fills safe_values with sb_size small inputs chosen to avoid arithmetic
+// rounding/overflow issues, then shuffles them.
+// - sb_size <= 5: the first sb_size entries of { 2, 3, 4, 5, 6 }.
+// - otherwise: { 2, 3, 4, 5, 6 } once, and 1 for every remaining work item.
+// The shuffle engine is seeded with a constant, so the resulting order does
+// not change from run to run.
+static void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values,
+                                         int sb_size)
+    // max product is 720, cl_half has enough precision for it
+    const std::vector<cl_ulong> non_one_values{ 2, 3, 4, 5, 6 };
+    if (sb_size <= non_one_values.size())
+    {
+        // Small subgroup: take only as many non-one values as there are
+        // work items.
+        safe_values.assign(non_one_values.begin(),
+                           non_one_values.begin() + sb_size);
+    }
+    else
+    {
+        // Start from all ones, then overwrite the first entries with the
+        // non-one values.
+        safe_values.assign(sb_size, 1);
+        std::copy(non_one_values.begin(), non_one_values.end(),
+                  safe_values.begin());
+    }
+    // Constant seed: the permutation is reproducible.
+    std::mt19937 mersenne_twister_engine(10000);
+    std::shuffle(safe_values.begin(), safe_values.end(),
+                 mersenne_twister_engine);
 template <typename Ty, ArithmeticOp operation>
-void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+void generate_inputs(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
     int nj = (nw + ns - 1) / ns;
+    std::vector<cl_ulong> safe_values;
+    if (operation == ArithmeticOp::mul_ || operation == ArithmeticOp::add_)
+    {
+        fill_and_shuffle_safe_values(safe_values, ns);
+    }
     for (int k = 0; k < ng; ++k)
         for (int j = 0; j < nj; ++j)
@@ -407,13 +441,10 @@
             for (int i = 0; i < n; ++i)
                 cl_ulong out_value;
-                double y;
                 if (operation == ArithmeticOp::mul_
                     || operation == ArithmeticOp::add_)
-                    // work around to avoid overflow, do not use 0 for
-                    // multiplication
-                    out_value = (genrand_int32(gMTdata) % 4) + 1;
+                    out_value = safe_values[i];
@@ -441,18 +472,23 @@
 template <typename Ty, ShuffleOp operation> struct SHF
+    // Logs a single line identifying the shuffle operation under test and
+    // the element type, followed by the caller-supplied extra_text.
+    // test_params is not read by this implementation.
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
-        int i, ii, j, k, l, n, delta;
+        int i, ii, j, k, n;
+        cl_uint l;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
-        int d = ns > 100 ? 100 : ns;
         ii = 0;
         ng = ng / nw;
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         for (k = 0; k < ng; ++k)
         { // for each work_group
             for (j = 0; j < nj; ++j)
@@ -462,30 +498,31 @@
                 for (i = 0; i < n; ++i)
                     int midx = 4 * ii + 4 * i + 2;
-                    l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
-                        % (d > n ? n : d);
+                    l = (((cl_uint)(genrand_int32(gMTdata) & 0x7fffffff) + 1)
+                         % (ns * 2 + 1))
+                        - 1;
                     switch (operation)
                         case ShuffleOp::shuffle:
                         case ShuffleOp::shuffle_xor:
-                            // storing information about shuffle index
+                        case ShuffleOp::shuffle_up:
+                        case ShuffleOp::shuffle_down:
+                            // storing information about shuffle index/delta
                             m[midx] = (cl_int)l;
-                        case ShuffleOp::shuffle_up:
-                            delta = l; // calculate delta for shuffle up
-                            if (i - delta < 0)
+                        case ShuffleOp::rotate:
+                        case ShuffleOp::clustered_rotate:
+                            // Storing information about rotate delta.
+                            // The delta must be the same for each thread in
+                            // the subgroup.
+                            if (i == 0)
-                                delta = i;
+                                m[midx] = (cl_int)l;
-                            m[midx] = (cl_int)delta;
-                            break;
-                        case ShuffleOp::shuffle_down:
-                            delta = l; // calculate delta for shuffle down
-                            if (i + delta >= n)
+                            else
-                                delta = n - 1 - i;
+                                m[midx] = m[midx - 4];
-                            m[midx] = (cl_int)delta;
                         default: break;
@@ -503,10 +540,11 @@
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
-        int ii, i, j, k, l, n;
+        int ii, i, j, k, n;
+        cl_uint l;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
@@ -531,32 +569,51 @@
                 { // inside the subgroup
                   // shuffle index storage
                     int midx = 4 * ii + 4 * i + 2;
-                    l = (int)m[midx];
+                    l = m[midx];
                     rr = my[ii + i];
+                    cl_uint tr_idx;
+                    bool skip = false;
                     switch (operation)
                         // shuffle basic - treat l as index
-                        case ShuffleOp::shuffle: tr = mx[ii + l]; break;
-                        // shuffle up - treat l as delta
-                        case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break;
-                        // shuffle up - treat l as delta
-                        case ShuffleOp::shuffle_down:
-                            tr = mx[ii + i + l];
-                            break;
+                        case ShuffleOp::shuffle: tr_idx = l; break;
                         // shuffle xor - treat l as mask
-                        case ShuffleOp::shuffle_xor:
-                            tr = mx[ii + (i ^ l)];
+                        case ShuffleOp::shuffle_xor: tr_idx = i ^ l; break;
+                        // shuffle up - treat l as delta
+                        case ShuffleOp::shuffle_up:
+                            if (l >= ns) skip = true;
+                            tr_idx = i - l;
+                        // shuffle down - treat l as delta
+                        case ShuffleOp::shuffle_down:
+                            if (l >= ns) skip = true;
+                            tr_idx = i + l;
+                            break;
+                        // rotate - treat l as delta
+                        case ShuffleOp::rotate:
+                            tr_idx = (i + l) % test_params.subgroup_size;
+                            break;
+                        case ShuffleOp::clustered_rotate: {
+                            tr_idx = ((i & ~(test_params.cluster_size - 1))
+                                      + ((i + l) % test_params.cluster_size));
+                            break;
+                        }
                         default: break;
-                    if (!compare(rr, tr))
+                    if (!skip && tr_idx < n)
-                        log_error("ERROR: sub_group_%s(%s) mismatch for "
-                                  "local id %d in sub group %d in group %d\n",
-                                  operation_names(operation),
-                                  TypeManager<Ty>::name(), i, j, k);
-                        return TEST_FAIL;
+                        tr = mx[ii + tr_idx];
+                        if (!compare(rr, tr))
+                        {
+                            log_error("ERROR: sub_group_%s(%s) mismatch for "
+                                      "local id %d in sub group %d in group "
+                                      "%d\n",
+                                      operation_names(operation),
+                                      TypeManager<Ty>::name(), i, j, k);
+                            return TEST_FAIL;
+                        }
@@ -564,51 +621,53 @@
             y += nw;
             m += 4 * nw;
-        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         return TEST_PASS;
 template <typename Ty, ArithmeticOp operation> struct SCEX_NU
+    // Logs a single line identifying the exclusive-scan function under test,
+    // the arithmetic operation, and the element type, followed by extra_text.
+    // The "non_uniform" function name is used whenever test_params carries
+    // at least one work-item mask.
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_exclusive"
+                                     : "sub_group_scan_exclusive");
+        log_info("  %s_%s(%s)...%s\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         ng = ng / nw;
-        std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
-                        : func_name = "sub_group_scan_exclusive";
-        log_info("  %s_%s(%s)...\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x \n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask);
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
+        bs128 work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         Ty tr, rr;
         ng = ng / nw;
-        std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
-                        : func_name = "sub_group_scan_exclusive";
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_exclusive"
+                                     : "sub_group_scan_exclusive");
-        uint32_t use_work_items_mask;
         // for uniform case take into consideration all workitems
-        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        if (!work_items_mask.any())
+        {
+            work_items_mask.set();
+        }
         for (k = 0; k < ng; ++k)
         { // for each work_group
             // Map to array indexed to array indexed by local ID and sub group
@@ -624,35 +683,21 @@
                 std::set<int> active_work_items;
                 for (i = 0; i < n; ++i)
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (use_work_items_mask & check_work_item)
+                    if (work_items_mask.test(i))
                 if (active_work_items.empty())
-                    log_info("  No acitve workitems in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
-                    continue;
-                }
-                else if (active_work_items.size() == 1)
-                {
-                    log_info("  One active workitem in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
                     tr = TypeManager<Ty>::identify_limits(operation);
-                    int idx = 0;
                     for (const int &active_work_item : active_work_items)
                         rr = my[ii + active_work_item];
-                        if (idx == 0) continue;
                         if (!compare_ordered(rr, tr))
@@ -665,7 +710,6 @@
                         tr = calculate<Ty>(tr, mx[ii + active_work_item],
-                        idx++;
@@ -674,8 +718,6 @@
             m += 4 * nw;
-        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
@@ -683,44 +725,48 @@
 // Test for scan inclusive non uniform functions
 template <typename Ty, ArithmeticOp operation> struct SCIN_NU
+    // Logs a single line identifying the inclusive-scan function under test,
+    // the arithmetic operation, and the element type, followed by extra_text.
+    // The "non_uniform" function name is used whenever test_params carries
+    // at least one work-item mask.
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_inclusive"
+                                     : "sub_group_scan_inclusive");
+        log_info("  %s_%s(%s)...%s\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         ng = ng / nw;
-        std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
-                        : func_name = "sub_group_scan_inclusive";
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
-        log_info("  %s_%s(%s)...\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x \n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
+        bs128 work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         Ty tr, rr;
         ng = ng / nw;
-        std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
-                        : func_name = "sub_group_scan_inclusive";
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_inclusive"
+                                     : "sub_group_scan_inclusive");
-        uint32_t use_work_items_mask;
         // for uniform case take into consideration all workitems
-        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        if (!work_items_mask.any())
+        {
+            work_items_mask.set();
+        }
         // std::bitset<32> mask32(use_work_items_mask);
         // for (int k) mask32.count();
         for (k = 0; k < ng; ++k)
@@ -740,8 +786,7 @@
                 for (i = 0; i < n; ++i)
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (use_work_items_mask & check_work_item)
+                    if (work_items_mask.test(i))
                         if (catch_frist_active == -1)
@@ -752,9 +797,6 @@
                 if (active_work_items.empty())
-                    log_info("  No acitve workitems in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
@@ -792,8 +834,6 @@
             m += 4 * nw;
-        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
@@ -801,41 +841,41 @@
 // Test for reduce non uniform functions
 template <typename Ty, ArithmeticOp operation> struct RED_NU
+    // Logs a single line identifying the reduce function under test, the
+    // arithmetic operation, and the element type, followed by extra_text.
+    // The "non_uniform" function name is used whenever test_params carries
+    // at least one work-item mask.
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_reduce"
+                                     : "sub_group_reduce");
+        log_info("  %s_%s(%s)...%s\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         ng = ng / nw;
-        std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
-                        : func_name = "sub_group_reduce";
-        log_info("  %s_%s(%s)...\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x \n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask);
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
+        bs128 work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         ng = ng / nw;
         Ty tr, rr;
-        std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
-                        : func_name = "sub_group_reduce";
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_reduce"
+                                     : "sub_group_reduce");
         for (k = 0; k < ng; ++k)
@@ -847,9 +887,10 @@
                 my[j] = y[j];
-            uint32_t use_work_items_mask;
-            use_work_items_mask =
-                !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+            if (!work_items_mask.any())
+            {
+                work_items_mask.set();
+            }
             for (j = 0; j < nj; ++j)
@@ -859,8 +900,7 @@
                 int catch_frist_active = -1;
                 for (i = 0; i < n; ++i)
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (use_work_items_mask & check_work_item)
+                    if (work_items_mask.test(i))
                         if (catch_frist_active == -1)
@@ -876,9 +916,6 @@
                 if (active_work_items.empty())
-                    log_info("  No acitve workitems in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
@@ -902,8 +939,6 @@
             m += 4 * nw;
-        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 93673b3..0a2c390 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -24,32 +24,194 @@
 #include <limits>
 #include <vector>
 #include <type_traits>
+#include <bitset>
+#include <regex>
+#include <map>
 extern MTdata gMTdata;
+typedef std::bitset<128> bs128;
+extern cl_half_rounding_mode g_rounding_mode;
+// Packs the four components of v into one 128-bit bitset:
+// s0 -> bits 0..31, s1 -> bits 32..63, s2 -> bits 64..95, s3 -> bits 96..127.
+static bs128 cl_uint4_to_bs128(cl_uint4 v)
+    return bs128(v.s0) | (bs128(v.s1) << 32) | (bs128(v.s2) << 64)
+        | (bs128(v.s3) << 96);
+// Splits a 128-bit bitset into four 32-bit components:
+// bits 0..31 -> s0, bits 32..63 -> s1, bits 64..95 -> s2, bits 96..127 -> s3.
+static cl_uint4 bs128_to_cl_uint4(bs128 v)
+    // Mask that keeps only the low 32 bits, so each to_ulong() call below
+    // converts a value that fits in 32 bits.
+    bs128 bs128_ffffffff = 0xffffffffU;
+    cl_uint4 r;
+    r.s0 = ((v >> 0) & bs128_ffffffff).to_ulong();
+    r.s1 = ((v >> 32) & bs128_ffffffff).to_ulong();
+    r.s2 = ((v >> 64) & bs128_ffffffff).to_ulong();
+    r.s3 = ((v >> 96) & bs128_ffffffff).to_ulong();
+    return r;
 struct WorkGroupParams
-    WorkGroupParams(size_t gws, size_t lws,
-                    const std::vector<std::string> &req_ext = {},
-                    const std::vector<uint32_t> &all_wim = {})
+    // gws / lws: global and local work-group sizes.
+    // dm_arg / cs_arg: stored as divergence_mask_arg / cluster_size_arg.
+    // With the default dm_arg of -1, load_masks() adds nothing and
+    // all_work_item_masks stays empty; any other value fills it with the
+    // predefined masks.
+    WorkGroupParams(size_t gws, size_t lws, int dm_arg = -1, int cs_arg = -1)
         : global_workgroup_size(gws), local_workgroup_size(lws),
-          required_extensions(req_ext), all_work_item_masks(all_wim)
+          divergence_mask_arg(dm_arg), cluster_size_arg(cs_arg)
         subgroup_size = 0;
+        cluster_size = 0;
         work_items_mask = 0;
         use_core_subgroups = true;
         dynsc = 0;
+        load_masks();
     size_t global_workgroup_size;
     size_t local_workgroup_size;
     size_t subgroup_size;
-    uint32_t work_items_mask;
-    int dynsc;
+    cl_uint cluster_size;
+    bs128 work_items_mask;
+    size_t dynsc;
     bool use_core_subgroups;
-    std::vector<std::string> required_extensions;
-    std::vector<uint32_t> all_work_item_masks;
+    std::vector<bs128> all_work_item_masks;
+    int divergence_mask_arg;
+    int cluster_size_arg;
+    // Registers source under name; an empty name is stored as "default".
+    // If the name is already registered, a notice is logged and the earlier
+    // source is overwritten.
+    void save_kernel_source(const std::string &source, std::string name = "")
+    {
+        if (name == "")
+        {
+            name = "default";
+        }
+        if (kernel_function_name.find(name) != kernel_function_name.end())
+        {
+            log_info("Kernel definition duplication. Source will be "
+                     "overwritten for function name %s\n",
+                     name.c_str());
+        }
+        kernel_function_name[name] = source;
+    };
+    // Returns the source registered under name, or the source registered
+    // under "default" when name is unknown.
+    // NOTE(review): the fallback uses operator[], so when no "default" source
+    // was ever saved this inserts an empty entry and returns an empty string.
+    std::string get_kernel_source(std::string name)
+    {
+        if (kernel_function_name.find(name) == kernel_function_name.end())
+        {
+            return kernel_function_name["default"];
+        }
+        return kernel_function_name[name];
+    }
+    // Maps a kernel name ("default" for the unnamed one) to its source text.
+    std::map<std::string, std::string> kernel_function_name;
+    // Appends the predefined 128-bit work-item masks to all_work_item_masks,
+    // but only when divergence_mask_arg != -1; otherwise nothing is added.
+    //
+    // Every mask is built with std::bitset's string constructor
+    // (str, n, zero, one). The FIRST character of the string is the most
+    // significant bit (bit 127) and the last character is bit 0. The zero/one
+    // arguments say which character means "cleared"/"set", so passing
+    // ('1', '0') produces the bitwise inverse of the written pattern.
+    void load_masks()
+    {
+        if (divergence_mask_arg != -1)
+        {
+            // '1' in the string sets the bit, '0' clears it
+            bs128 mask_0xf0f0f0f0("11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xf0f0f0f0);
+            // inverted: '1' in the string clears the bit, '0' sets it
+            bs128 mask_0x0f0f0f0f("11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000",
+                                  128, '1', '0');
+            all_work_item_masks.push_back(mask_0x0f0f0f0f);
+            // NOTE(review): the written pattern is 0xaaaaaaaa in every 32-bit
+            // word, which does not match the 0x5555aaaa in the variable name.
+            bs128 mask_0x5555aaaa("10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x5555aaaa);
+            // NOTE(review): inverted pattern, i.e. 0x55555555 in every 32-bit
+            // word, which does not match the 0xaaaa5555 in the variable name.
+            bs128 mask_0xaaaa5555("10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010",
+                                  128, '1', '0');
+            all_work_item_masks.push_back(mask_0xaaaa5555);
+            // 0x0f0ff0f0
+            bs128 mask_0x0f0ff0f0("00001111000011111111000011110000"
+                                  "00001111000011111111000011110000"
+                                  "00001111000011111111000011110000"
+                                  "00001111000011111111000011110000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x0f0ff0f0);
+            // 0xff0000ff
+            bs128 mask_0xff0000ff("11111111000000000000000011111111"
+                                  "11111111000000000000000011111111"
+                                  "11111111000000000000000011111111"
+                                  "11111111000000000000000011111111",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xff0000ff);
+            // 0xff00ff00
+            bs128 mask_0xff00ff00("11111111000000001111111100000000"
+                                  "11111111000000001111111100000000"
+                                  "11111111000000001111111100000000"
+                                  "11111111000000001111111100000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xff00ff00);
+            // 0x00ffff00
+            bs128 mask_0x00ffff00("00000000111111111111111100000000"
+                                  "00000000111111111111111100000000"
+                                  "00000000111111111111111100000000"
+                                  "00000000111111111111111100000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x00ffff00);
+            // 0x80: one work item (highest id) for a subgroup size of 8
+            bs128 mask_0x80808080("10000000100000001000000010000000"
+                                  "10000000100000001000000010000000"
+                                  "10000000100000001000000010000000"
+                                  "10000000100000001000000010000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x80808080);
+            // 0x8000: one work item (highest id) for a subgroup size of 16
+            bs128 mask_0x80008000("10000000000000001000000000000000"
+                                  "10000000000000001000000000000000"
+                                  "10000000000000001000000000000000"
+                                  "10000000000000001000000000000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x80008000);
+            // 0x80000000: one work item (highest id) for a subgroup size of 32
+            bs128 mask_0x80000000("10000000000000000000000000000000"
+                                  "10000000000000000000000000000000"
+                                  "10000000000000000000000000000000"
+                                  "10000000000000000000000000000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x80000000);
+            // 0x80000000 00000000: one work item (highest id) for a subgroup
+            // size of 64
+            bs128 mask_0x8000000000000000("10000000000000000000000000000000"
+                                          "00000000000000000000000000000000"
+                                          "10000000000000000000000000000000"
+                                          "00000000000000000000000000000000",
+                                          128, '0', '1');
+            all_work_item_masks.push_back(mask_0x8000000000000000);
+            // 0x80000000 00000000 00000000 00000000: one work item (highest
+            // id) for a subgroup size of 128
+            bs128 mask_0x80000000000000000000000000000000(
+                "10000000000000000000000000000000"
+                "00000000000000000000000000000000"
+                "00000000000000000000000000000000"
+                "00000000000000000000000000000000",
+                128, '0', '1');
+            all_work_item_masks.push_back(
+                mask_0x80000000000000000000000000000000);
+            // all 128 bits set
+            bs128 mask_0xffffffff("11111111111111111111111111111111"
+                                  "11111111111111111111111111111111"
+                                  "11111111111111111111111111111111"
+                                  "11111111111111111111111111111111",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xffffffff);
+        }
+    }
 enum class SubgroupsBroadcastOp
@@ -89,7 +251,9 @@
-    shuffle_xor
+    shuffle_xor,
+    rotate,
+    clustered_rotate,
 enum class ArithmeticOp
@@ -120,7 +284,7 @@
         case ArithmeticOp::logical_and: return "logical_and";
         case ArithmeticOp::logical_or: return "logical_or";
         case ArithmeticOp::logical_xor: return "logical_xor";
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return "";
@@ -142,7 +306,7 @@
         case BallotOp::gt_mask: return "gt";
         case BallotOp::le_mask: return "le";
         case BallotOp::lt_mask: return "lt";
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return "";
@@ -155,7 +319,9 @@
         case ShuffleOp::shuffle_up: return "shuffle_up";
         case ShuffleOp::shuffle_down: return "shuffle_down";
         case ShuffleOp::shuffle_xor: return "shuffle_xor";
-        default: log_error("Unknown operation request"); break;
+        case ShuffleOp::rotate: return "rotate";
+        case ShuffleOp::clustered_rotate: return "clustered_rotate";
+        default: log_error("Unknown operation request\n"); break;
     return "";
@@ -168,7 +334,7 @@
         case NonUniformVoteOp::all_equal: return "all_equal";
         case NonUniformVoteOp::any: return "any";
         case NonUniformVoteOp::elect: return "elect";
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return "";
@@ -181,7 +347,7 @@
         case SubgroupsBroadcastOp::broadcast_first: return "broadcast_first";
         case SubgroupsBroadcastOp::non_uniform_broadcast:
             return "non_uniform_broadcast";
-        default: log_error("Unknown operation request"); break;
+        default: log_error("Unknown operation request\n"); break;
     return "";
@@ -358,7 +524,7 @@
             case ArithmeticOp::and_: return (Ty)~0;
             case ArithmeticOp::or_: return (Ty)0;
             case ArithmeticOp::xor_: return (Ty)0;
-            default: log_error("Unknown operation request"); break;
+            default: log_error("Unknown operation request\n"); break;
         return 0;
@@ -386,7 +552,7 @@
             case ArithmeticOp::logical_and: return (cl_int)1;
             case ArithmeticOp::logical_or: return (cl_int)0;
             case ArithmeticOp::logical_xor: return (cl_int)0;
-            default: log_error("Unknown operation request"); break;
+            default: log_error("Unknown operation request\n"); break;
         return 0;
@@ -800,7 +966,7 @@
             case ArithmeticOp::min_:
                 return std::numeric_limits<float>::infinity();
             case ArithmeticOp::mul_: return (cl_float)1;
-            default: log_error("Unknown operation request"); break;
+            default: log_error("Unknown operation request\n"); break;
         return 0;
@@ -859,7 +1025,7 @@
             case ArithmeticOp::min_:
                 return std::numeric_limits<double>::infinity();
             case ArithmeticOp::mul_: return (cl_double)1;
-            default: log_error("Unknown operation request"); break;
+            default: log_error("Unknown operation request\n"); break;
         return 0;
@@ -946,7 +1112,7 @@
             case ArithmeticOp::max_: return { 0xfc00 };
             case ArithmeticOp::min_: return { 0x7c00 };
             case ArithmeticOp::mul_: return { 0x3c00 };
-            default: log_error("Unknown operation request"); break;
+            default: log_error("Unknown operation request\n"); break;
         return { 0 };
@@ -1080,7 +1246,7 @@
 typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value>::type
 set_value(Ty &lhs, const cl_ulong &rhs)
- = rhs;
+ = cl_half_from_float(static_cast<cl_float>(rhs), g_rounding_mode);
 // compare for common vectors
@@ -1164,98 +1330,172 @@
     return cl_half_to_float( == rhs;
-// Run a test kernel to compute the result of a built-in on an input
-static int run_kernel(cl_context context, cl_command_queue queue,
-                      cl_kernel kernel, size_t global, size_t local,
-                      void *idata, size_t isize, void *mdata, size_t msize,
-                      void *odata, size_t osize, size_t tsize = 0)
-    clMemWrapper in;
-    clMemWrapper xy;
-    clMemWrapper out;
-    clMemWrapper tmp;
-    int error;
-    in = clCreateBuffer(context, CL_MEM_READ_ONLY, isize, NULL, &error);
-    test_error(error, "clCreateBuffer failed");
-    xy = clCreateBuffer(context, CL_MEM_WRITE_ONLY, msize, NULL, &error);
-    test_error(error, "clCreateBuffer failed");
-    out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, osize, NULL, &error);
-    test_error(error, "clCreateBuffer failed");
-    if (tsize)
+template <typename Ty, typename Fns> class KernelExecutor {
+    KernelExecutor(cl_context c, cl_command_queue q, cl_kernel k, size_t g,
+                   size_t l, Ty *id, size_t is, Ty *mid, Ty *mod, cl_int *md,
+                   size_t ms, Ty *od, size_t os, size_t ts = 0)
+        : context(c), queue(q), kernel(k), global(g), local(l), idata(id),
+          isize(is), mapin_data(mid), mapout_data(mod), mdata(md), msize(ms),
+          odata(od), osize(os), tsize(ts)
-        tmp = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
-                             tsize, NULL, &error);
+        has_status = false;
+        run_failed = false;
+    }
+    cl_context context;
+    cl_command_queue queue;
+    cl_kernel kernel;
+    size_t global;
+    size_t local;
+    Ty *idata;
+    size_t isize;
+    Ty *mapin_data;
+    Ty *mapout_data;
+    cl_int *mdata;
+    size_t msize;
+    Ty *odata;
+    size_t osize;
+    size_t tsize;
+    bool run_failed;
+    bool has_status;
+    test_status status;
+    // Run a test kernel to compute the result of a built-in on an input
+    int run()
+    {
+        clMemWrapper in;
+        clMemWrapper xy;
+        clMemWrapper out;
+        clMemWrapper tmp;
+        int error;
+        in = clCreateBuffer(context, CL_MEM_READ_ONLY, isize, NULL, &error);
         test_error(error, "clCreateBuffer failed");
-    }
-    error = clSetKernelArg(kernel, 0, sizeof(in), (void *)&in);
-    test_error(error, "clSetKernelArg failed");
+        xy = clCreateBuffer(context, CL_MEM_WRITE_ONLY, msize, NULL, &error);
+        test_error(error, "clCreateBuffer failed");
-    error = clSetKernelArg(kernel, 1, sizeof(xy), (void *)&xy);
-    test_error(error, "clSetKernelArg failed");
+        out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, osize, NULL, &error);
+        test_error(error, "clCreateBuffer failed");
-    error = clSetKernelArg(kernel, 2, sizeof(out), (void *)&out);
-    test_error(error, "clSetKernelArg failed");
+        if (tsize)
+        {
+            tmp = clCreateBuffer(context,
+                                 CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
+                                 tsize, NULL, &error);
+            test_error(error, "clCreateBuffer failed");
+        }
-    if (tsize)
-    {
-        error = clSetKernelArg(kernel, 3, sizeof(tmp), (void *)&tmp);
+        error = clSetKernelArg(kernel, 0, sizeof(in), (void *)&in);
         test_error(error, "clSetKernelArg failed");
+        error = clSetKernelArg(kernel, 1, sizeof(xy), (void *)&xy);
+        test_error(error, "clSetKernelArg failed");
+        error = clSetKernelArg(kernel, 2, sizeof(out), (void *)&out);
+        test_error(error, "clSetKernelArg failed");
+        if (tsize)
+        {
+            error = clSetKernelArg(kernel, 3, sizeof(tmp), (void *)&tmp);
+            test_error(error, "clSetKernelArg failed");
+        }
+        error = clEnqueueWriteBuffer(queue, in, CL_FALSE, 0, isize, idata, 0,
+                                     NULL, NULL);
+        test_error(error, "clEnqueueWriteBuffer failed");
+        error = clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0,
+                                     NULL, NULL);
+        test_error(error, "clEnqueueWriteBuffer failed");
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local,
+                                       0, NULL, NULL);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+        error = clEnqueueReadBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0,
+                                    NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed");
+        error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, osize, odata, 0,
+                                    NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed");
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+        return error;
-    error = clEnqueueWriteBuffer(queue, in, CL_FALSE, 0, isize, idata, 0, NULL,
-                                 NULL);
-    test_error(error, "clEnqueueWriteBuffer failed");
+    test_status
+    run_and_check_with_cluster_size(const WorkGroupParams &test_params)
+    {
+        cl_int error = run();
+        if (error != CL_SUCCESS)
+        {
+            print_error(error, "Failed to run subgroup test kernel");
+            status = TEST_FAIL;
+            run_failed = true;
+            return status;
+        }
-    error = clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL,
-                                 NULL);
-    test_error(error, "clEnqueueWriteBuffer failed");
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
-                                   NULL, NULL);
-    test_error(error, "clEnqueueNDRangeKernel failed");
+        test_status tmp_status =
+            Fns::chk(idata, odata, mapin_data, mapout_data, mdata, test_params);
-    error = clEnqueueReadBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL,
-                                NULL);
-    test_error(error, "clEnqueueReadBuffer failed");
+        if (!has_status || tmp_status == TEST_FAIL
+            || (tmp_status == TEST_PASS && status != TEST_FAIL))
+        {
+            status = tmp_status;
+            has_status = true;
+        }
-    error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, osize, odata, 0, NULL,
-                                NULL);
-    test_error(error, "clEnqueueReadBuffer failed");
+        return status;
+    }
-    error = clFinish(queue);
-    test_error(error, "clFinish failed");
+    test_status run_and_check(WorkGroupParams &test_params)
+    {
+        test_status tmp_status = TEST_SKIPPED_ITSELF;
-    return error;
+        if (test_params.cluster_size_arg != -1)
+        {
+            for (cl_uint cluster_size = 1;
+                 cluster_size <= test_params.subgroup_size; cluster_size *= 2)
+            {
+                test_params.cluster_size = cluster_size;
+                cl_int error =
+                    clSetKernelArg(kernel, test_params.cluster_size_arg,
+                                   sizeof(cl_uint), &cluster_size);
+                test_error_fail(error, "Unable to set cluster size");
+                tmp_status = run_and_check_with_cluster_size(test_params);
+                if (tmp_status == TEST_FAIL) break;
+            }
+        }
+        else
+        {
+            tmp_status = run_and_check_with_cluster_size(test_params);
+        }
+        return tmp_status;
+    }
 // Driver for testing a single built in function
 template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
-    static int mrun(cl_device_id device, cl_context context,
-                    cl_command_queue queue, int num_elements, const char *kname,
-                    const char *src, WorkGroupParams test_params)
-    {
-        int error = TEST_PASS;
-        for (auto &mask : test_params.all_work_item_masks)
-        {
-            test_params.work_items_mask = mask;
-            error |= run(device, context, queue, num_elements, kname, src,
-                         test_params);
-        }
-        return error;
-    };
-    static int run(cl_device_id device, cl_context context,
-                   cl_command_queue queue, int num_elements, const char *kname,
-                   const char *src, WorkGroupParams test_params)
+    static test_status run(cl_device_id device, cl_context context,
+                           cl_command_queue queue, int num_elements,
+                           const char *kname, const char *src,
+                           WorkGroupParams test_params)
         size_t tmp;
-        int error;
-        int subgroup_size, num_subgroups;
-        size_t realSize;
+        cl_int error;
+        size_t subgroup_size, num_subgroups;
         size_t global = test_params.global_workgroup_size;
         size_t local = test_params.local_workgroup_size;
         clProgramWrapper program;
@@ -1268,13 +1508,8 @@
         std::vector<Ty> mapout;
         std::stringstream kernel_sstr;
-        if (test_params.work_items_mask != 0)
-        {
-            kernel_sstr << "#define WORK_ITEMS_MASK ";
-            kernel_sstr << "0x" << std::hex << test_params.work_items_mask
-                        << "\n";
-        }
+        Fns::log_test(test_params, "");
         kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS ";
         kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n";
@@ -1282,36 +1517,21 @@
         if (!TypeManager<Ty>::type_supported(device))
             log_info("Data type not supported : %s\n", TypeManager<Ty>::name());
-            return 0;
-        }
-        else
-        {
-            if (strstr(TypeManager<Ty>::name(), "double"))
-            {
-                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
-            }
-            else if (strstr(TypeManager<Ty>::name(), "half"))
-            {
-                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n";
-            }
+            return TEST_SKIPPED_ITSELF;
-        for (std::string extension : test_params.required_extensions)
+        if (strstr(TypeManager<Ty>::name(), "double"))
-            if (!is_extension_available(device, extension.c_str()))
-            {
-                log_info("The extension %s not supported on this device. SKIP "
-                         "testing - kernel %s data type %s\n",
-                         extension.c_str(), kname, TypeManager<Ty>::name());
-                return TEST_PASS;
-            }
-            kernel_sstr << "#pragma OPENCL EXTENSION " + extension
-                    + ": enable\n";
+            kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
+        }
+        else if (strstr(TypeManager<Ty>::name(), "half"))
+        {
+            kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n";
         error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
                                 (void *)&platform, NULL);
-        test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
+        test_error_fail(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
         if (test_params.use_core_subgroups)
@@ -1326,12 +1546,12 @@
         error = create_single_kernel_helper(context, &program, &kernel, 1,
                                             &kernel_src, kname);
-        if (error != 0) return error;
+        if (error != CL_SUCCESS) return TEST_FAIL;
         // Determine some local dimensions to use for the test.
         error = get_max_common_work_group_size(
             context, kernel, test_params.global_workgroup_size, &local);
-        test_error(error, "get_max_common_work_group_size failed");
+        test_error_fail(error, "get_max_common_work_group_size failed");
         // Limit it a bit so we have multiple work groups
         // Ideally this will still be large enough to give us multiple
@@ -1345,7 +1565,7 @@
         if (clGetKernelSubGroupInfo_ptr == NULL)
-            log_error("ERROR: %s function not available",
+            log_error("ERROR: %s function not available\n",
             return TEST_FAIL;
@@ -1355,12 +1575,12 @@
         if (error != CL_SUCCESS)
             log_error("ERROR: %s function error for "
-                      "CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE",
+                      "CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE\n",
             return TEST_FAIL;
-        subgroup_size = (int)tmp;
+        subgroup_size = tmp;
         error = clGetKernelSubGroupInfo_ptr(
             kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE,
@@ -1368,16 +1588,16 @@
         if (error != CL_SUCCESS)
             log_error("ERROR: %s function error for "
-                      "CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE",
+                      "CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE\n",
             return TEST_FAIL;
-        num_subgroups = (int)tmp;
+        num_subgroups = tmp;
         // Make sure the number of sub groups is what we expect
         if (num_subgroups != (local + subgroup_size - 1) / subgroup_size)
-            log_error("ERROR: unexpected number of subgroups (%d) returned\n",
+            log_error("ERROR: unexpected number of subgroups (%zu) returned\n",
             return TEST_FAIL;
@@ -1386,41 +1606,83 @@
         std::vector<Ty> odata;
         size_t input_array_size = global;
         size_t output_array_size = global;
-        int dynscl = test_params.dynsc;
+        size_t dynscl = test_params.dynsc;
         if (dynscl != 0)
-            input_array_size =
-                (int)global / (int)local * num_subgroups * dynscl;
-            output_array_size = (int)global / (int)local * dynscl;
+            input_array_size = global / local * num_subgroups * dynscl;
+            output_array_size = global / local * dynscl;
+        if (test_params.divergence_mask_arg != -1)
+        {
+            cl_uint4 mask_vector;
+            mask_vector.x = 0xffffffffU;
+            mask_vector.y = 0xffffffffU;
+            mask_vector.z = 0xffffffffU;
+            mask_vector.w = 0xffffffffU;
+            error = clSetKernelArg(kernel, test_params.divergence_mask_arg,
+                                   sizeof(cl_uint4), &mask_vector);
+            test_error_fail(error, "Unable to set divergence mask argument");
+        }
+        if (test_params.cluster_size_arg != -1)
+        {
+            cl_uint dummy_cluster_size = 1;
+            error = clSetKernelArg(kernel, test_params.cluster_size_arg,
+                                   sizeof(cl_uint), &dummy_cluster_size);
+            test_error_fail(error, "Unable to set dummy cluster size");
+        }
+        KernelExecutor<Ty, Fns> executor(
+            context, queue, kernel, global, local,,
+            input_array_size * sizeof(Ty),,,
+  , global * sizeof(cl_int4),,
+            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
         // Run the kernel once on zeroes to get the map
         memset(, 0, input_array_size * sizeof(Ty));
-        error = run_kernel(context, queue, kernel, global, local,,
-                           input_array_size * sizeof(Ty),,
-                           global * sizeof(cl_int4),,
-                           output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        test_error(error, "Running kernel first time failed");
+        error =;
+        test_error_fail(error, "Running kernel first time failed");
         // Generate the desired input for the kernel
         test_params.subgroup_size = subgroup_size;
         Fns::gen(,,, test_params);
-        error = run_kernel(context, queue, kernel, global, local,,
-                           input_array_size * sizeof(Ty),,
-                           global * sizeof(cl_int4),,
-                           output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        test_error(error, "Running kernel second time failed");
-        // Check the result
-        error = Fns::chk(,,,
-               ,, test_params);
-        test_error(error, "Data verification failed");
-        return TEST_PASS;
+        test_status status;
+        if (test_params.divergence_mask_arg != -1)
+        {
+            for (auto &mask : test_params.all_work_item_masks)
+            {
+                test_params.work_items_mask = mask;
+                cl_uint4 mask_vector = bs128_to_cl_uint4(mask);
+                clSetKernelArg(kernel, test_params.divergence_mask_arg,
+                               sizeof(cl_uint4), &mask_vector);
+                status = executor.run_and_check(test_params);
+                if (status == TEST_FAIL) break;
+            }
+        }
+        else
+        {
+            status = executor.run_and_check(test_params);
+        }
+        // Detailed failure and skip messages should be logged by
+        // run_and_check.
+        if (status == TEST_PASS)
+        {
+            Fns::log_test(test_params, " passed");
+        }
+        else if (!executor.run_failed && status == TEST_FAIL)
+        {
+            test_fail("Data verification failed\n");
+        }
+        return status;
@@ -1466,21 +1728,21 @@
           num_elements_(num_elements), test_params_(test_params)
     template <typename T, typename U>
-    int run_impl(const char *kernel_name, const char *source)
+    int run_impl(const std::string &function_name)
         int error = TEST_PASS;
-        if (test_params_.all_work_item_masks.size() > 0)
-        {
-            error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
-                                     kernel_name, source, test_params_);
-        }
-        else
-        {
-            error = test<T, U>::run(device_, context_, queue_, num_elements_,
-                                    kernel_name, source, test_params_);
-        }
+        std::string source =
+            std::regex_replace(test_params_.get_kernel_source(function_name),
+                               std::regex("\\%s"), function_name);
+        std::string kernel_name = "test_" + function_name;
+        error =
+            test<T, U>::run(device_, context_, queue_, num_elements_,
+                            kernel_name.c_str(), source.c_str(), test_params_);
-        return error;
+        // If we return TEST_SKIPPED_ITSELF here, then an entire suite may be
+        // reported as having been skipped even if some tests within it
+        // passed, as the status codes are erroneously ORed together:
+        return error == TEST_FAIL ? TEST_FAIL : TEST_PASS;
diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp
index 47e42f6..fb93ddb 100644
--- a/test_conformance/subgroups/test_barrier.cpp
+++ b/test_conformance/subgroups/test_barrier.cpp
@@ -59,6 +59,17 @@
 // barrier test functions
 template <int Which> struct BAR
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        if (Which == 0)
+            log_info("  sub_group_barrier(CLK_LOCAL_MEM_FENCE)...%s\n",
+                     extra_text);
+        else
+            log_info("  sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...%s\n",
+                     extra_text);
+    }
     static void gen(cl_int *x, cl_int *t, cl_int *m,
                     const WorkGroupParams &test_params)
@@ -68,7 +79,6 @@
         int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
         ng = ng / nw;
-        int e;
         ii = 0;
         for (k = 0; k < ng; ++k)
@@ -92,8 +102,8 @@
-    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my,
+                           cl_int *m, const WorkGroupParams &test_params)
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
@@ -103,11 +113,6 @@
         ng = ng / nw;
         cl_int tr, rr;
-        if (Which == 0)
-            log_info("  sub_group_barrier(CLK_LOCAL_MEM_FENCE)...\n");
-        else
-            log_info("  sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...\n");
         for (k = 0; k < ng; ++k)
             // Map to array indexed to array indexed by local ID and sub group
@@ -133,7 +138,7 @@
                                   "id %d in sub group %d in group %d expected "
                                   "%d got %d\n",
                                   i, j, k, tr, rr);
-                        return -1;
+                        return TEST_FAIL;
@@ -143,7 +148,7 @@
             m += 2 * nw;
-        return 0;
+        return TEST_PASS;
@@ -187,4 +192,4 @@
     return test_barrier_functions(device, context, queue, num_elements, false);
\ No newline at end of file
diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp
index 428f2cd..f2bd5b9 100644
--- a/test_conformance/subgroups/test_ifp.cpp
+++ b/test_conformance/subgroups/test_ifp.cpp
@@ -225,6 +225,12 @@
 struct IFP
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  independent forward progress...%s\n", extra_text);
+    }
     static void gen(cl_int *x, cl_int *t, cl_int *,
                     const WorkGroupParams &test_params)
@@ -245,8 +251,8 @@
-    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
-                   const WorkGroupParams &test_params)
+    static test_status chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
+                           const WorkGroupParams &test_params)
         int i, k;
         int nw = test_params.local_workgroup_size;
@@ -255,10 +261,8 @@
         int nj = (nw + ns - 1) / ns;
         ng = ng / nw;
-        // We need at least 2 sub groups per group for this tes
-        if (nj == 1) return 0;
-        log_info("  independent forward progress...\n");
+        // We need at least 2 sub groups per group for this test
+        if (nj == 1) return TEST_SKIPPED_ITSELF;
         for (k = 0; k < ng; ++k)
@@ -270,14 +274,14 @@
                         "ERROR: mismatch at element %d in work group %d\n", i,
-                    return -1;
+                    return TEST_FAIL;
             x += nj * (NUM_LOC + 1);
             y += NUM_LOC;
-        return 0;
+        return TEST_PASS;
@@ -360,17 +364,21 @@
     // ifp only in subgroup functions tests:
     test_status error;
-    error = checkIFPSupport(device, ifpSupport);
-    if (error != TEST_PASS)
+    auto device_cl_version = get_device_cl_version(device);
+    if (device_cl_version >= Version(2, 1))
-        return error;
-    }
-    if (ifpSupport == false)
-    {
-        log_info(
-            "Error reason: the extension cl_khr_subgroups requires that "
-            "Independed forward progress has to be supported by device.\n");
-        return TEST_FAIL;
+        error = checkIFPSupport(device, ifpSupport);
+        if (error != TEST_PASS)
+        {
+            return error;
+        }
+        if (ifpSupport == false)
+        {
+            log_info(
+                "Error reason: the extension cl_khr_subgroups requires that "
+                "Independed forward progress has to be supported by device.\n");
+            return TEST_FAIL;
+        }
     return test_ifp(device, context, queue, num_elements, false);
\ No newline at end of file
diff --git a/test_conformance/subgroups/test_queries.cpp b/test_conformance/subgroups/test_queries.cpp
index 761ca7a..6b94093 100644
--- a/test_conformance/subgroups/test_queries.cpp
+++ b/test_conformance/subgroups/test_queries.cpp
@@ -100,7 +100,7 @@
     if (clGetKernelSubGroupInfo_ptr == NULL)
-        log_error("ERROR: %s function not available",
+        log_error("ERROR: %s function not available\n",
         return TEST_FAIL;
@@ -112,7 +112,7 @@
     if (error != CL_SUCCESS)
         log_error("ERROR: %s function error for "
         return TEST_FAIL;
@@ -133,7 +133,7 @@
     if (error != CL_SUCCESS)
         log_error("ERROR: %s function error "
-                  "for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE",
+                  "for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE\n",
         return TEST_FAIL;
@@ -209,4 +209,4 @@
     return test_sub_group_info(device, context, queue, num_elements, false);
\ No newline at end of file
diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp
index c0e4952..75e9d4a 100644
--- a/test_conformance/subgroups/test_subgroup.cpp
+++ b/test_conformance/subgroups/test_subgroup.cpp
@@ -24,6 +24,13 @@
 // Any/All test functions
 template <NonUniformVoteOp operation> struct AA
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s...%s\n", operation_names(operation),
+                 extra_text);
+    }
     static void gen(cl_int *x, cl_int *t, cl_int *m,
                     const WorkGroupParams &test_params)
@@ -35,7 +42,6 @@
         int e;
         ng = ng / nw;
         ii = 0;
-        log_info("  sub_group_%s...\n", operation_names(operation));
         for (k = 0; k < ng; ++k)
             for (j = 0; j < nj; ++j)
@@ -68,8 +74,8 @@
-    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my,
+                           cl_int *m, const WorkGroupParams &test_params)
         int ii, i, j, k, n;
         int ng = test_params.global_workgroup_size;
@@ -124,51 +130,33 @@
             y += nw;
             m += 4 * nw;
-        log_info("  sub_group_%s... passed\n", operation_names(operation));
         return TEST_PASS;
-static const char *any_source = "__kernel void test_any(const __global Type "
-                                "*in, __global int4 *xy, __global Type *out)\n"
-                                "{\n"
-                                "    int gid = get_global_id(0);\n"
-                                "    XY(xy,gid);\n"
-                                "    out[gid] = sub_group_any(in[gid]);\n"
-                                "}\n";
-static const char *all_source = "__kernel void test_all(const __global Type "
-                                "*in, __global int4 *xy, __global Type *out)\n"
-                                "{\n"
-                                "    int gid = get_global_id(0);\n"
-                                "    XY(xy,gid);\n"
-                                "    out[gid] = sub_group_all(in[gid]);\n"
-                                "}\n";
 template <typename T>
 int run_broadcast_scan_reduction_for_type(RunTestForType rft)
     int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
-        "test_bcast", bcast_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
-                                                            redadd_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
-                                                            redmax_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
-                                                            redmin_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
-                                                             scinadd_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
-                                                             scinmax_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
-                                                             scinmin_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
-                                                             scexadd_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
-                                                             scexmax_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
-                                                             scexmin_source);
+        "sub_group_broadcast");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_inclusive_add");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_inclusive_max");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_inclusive_min");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_exclusive_add");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_exclusive_max");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_exclusive_min");
     return error;
@@ -181,11 +169,14 @@
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
     WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_reduction_scan_source);
+    test_params.save_kernel_source(sub_group_generic_source,
+                                   "sub_group_broadcast");
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error =
-        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
-    error |=
-        rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
+        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("sub_group_any");
+    error |= rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("sub_group_all");
     error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
     error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
     error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index f2e4060..3882311 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -23,52 +23,34 @@
 // Test for ballot functions
 template <typename Ty> struct BALLOT
-    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
-        // no work here
-        int gws = test_params.global_workgroup_size;
-        int lws = test_params.local_workgroup_size;
-        int sbs = test_params.subgroup_size;
-        int non_uniform_size = gws % lws;
-        log_info("  sub_group_ballot...\n");
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
+        log_info("  sub_group_ballot...%s\n", extra_text);
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
-        int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
         int sb_number = (lws + sbs - 1) / sbs;
-        int current_sbs = 0;
-        cl_uint expected_result, device_result;
         int non_uniform_size = gws % lws;
         int wg_number = gws / lws;
         wg_number = non_uniform_size ? wg_number + 1 : wg_number;
         int last_subgroup_size = 0;
-        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        for (int wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
             if (non_uniform_size && wg_id == wg_number - 1)
                 set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
-            for (wi_id = 0; wi_id < lws; ++wi_id)
-            { // inside the work_group
-                // read device outputs for work_group
-                my[wi_id] = y[wi_id];
-            }
-            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            for (int sb_id = 0; sb_id < sb_number; ++sb_id)
             { // for each subgroup
                 int wg_offset = sb_id * sbs;
+                int current_sbs;
                 if (last_subgroup_size && sb_id == sb_number - 1)
                     current_sbs = last_subgroup_size;
@@ -77,26 +59,121 @@
                     current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
-                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                for (int wi_id = 0; wi_id < current_sbs; wi_id++)
-                    device_result = my[wg_offset + wi_id];
-                    expected_result = 1;
-                    if (!compare(device_result, expected_result))
+                    cl_uint v;
+                    if (genrand_bool(gMTdata))
+                    {
+                        v = genrand_bool(gMTdata);
+                    }
+                    else if (genrand_bool(gMTdata))
+                    {
+                        v = 1U << ((genrand_int32(gMTdata) % 31) + 1);
+                    }
+                    else
+                    {
+                        v = genrand_int32(gMTdata);
+                    }
+                    cl_uint4 v4 = { v, 0, 0, 0 };
+                    t[wi_id + wg_offset] = v4;
+                }
+            }
+            // Now map into work group using map from device
+            for (int wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
+    {
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        wg_number = non_uniform_size ? wg_number + 1 : wg_number;
+        int last_subgroup_size = 0;
+        for (int wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            for (int wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                mx[wi_id] = x[wi_id]; // read host inputs for work_group
+                my[wi_id] = y[wi_id]; // read device outputs for work_group
+            }
+            for (int sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                int current_sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                bs128 expected_result_bs = 0;
+                std::set<int> active_work_items;
+                for (int wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    if (test_params.work_items_mask.test(wi_id))
+                    {
+                        bool predicate = (mx[wg_offset + wi_id].s0 != 0);
+                        expected_result_bs |= (bs128(predicate) << wi_id);
+                        active_work_items.insert(wi_id);
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    continue;
+                }
+                cl_uint4 expected_result =
+                    bs128_to_cl_uint4(expected_result_bs);
+                for (const int &active_work_item : active_work_items)
+                {
+                    int wi_id = active_work_item;
+                    cl_uint4 device_result = my[wg_offset + wi_id];
+                    bs128 device_result_bs = cl_uint4_to_bs128(device_result);
+                    if (device_result_bs != expected_result_bs)
                             "ERROR: sub_group_ballot mismatch for local id "
-                            "%d in sub group %d in group %d obtained {%d}, "
-                            "expected {%d} \n",
-                            wi_id, sb_id, wg_id, device_result,
-                            expected_result);
+                            "%d in sub group %d in group %d obtained {%d, %d, "
+                            "%d, %d}, expected {%d, %d, %d, %d}\n",
+                            wi_id, sb_id, wg_id, device_result.s0,
+                            device_result.s1, device_result.s2,
+                            device_result.s3, expected_result.s0,
+                            expected_result.s1, expected_result.s2,
+                            expected_result.s3);
                         return TEST_FAIL;
+            x += lws;
             y += lws;
             m += 4 * lws;
-        log_info("  sub_group_ballot... passed\n");
         return TEST_PASS;
@@ -104,23 +181,22 @@
 // Test for bit extract ballot functions
 template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_ballot_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
-        int wi_id, sb_id, wg_id, l;
+        int wi_id, sb_id, wg_id;
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
         int sb_number = (lws + sbs - 1) / sbs;
         int wg_number = gws / lws;
         int limit_sbs = sbs > 100 ? 100 : sbs;
-        int non_uniform_size = gws % lws;
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
         for (wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
@@ -155,10 +231,10 @@
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
-        int wi_id, wg_id, l, sb_id;
+        int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
@@ -260,30 +336,25 @@
             y += lws;
             m += 4 * lws;
-        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         return TEST_PASS;
 template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_inverse_ballot...%s\n", extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
-        int gws = test_params.global_workgroup_size;
-        int lws = test_params.local_workgroup_size;
-        int sbs = test_params.subgroup_size;
-        int non_uniform_size = gws % lws;
-        log_info("  sub_group_inverse_ballot...\n");
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
         // no work here
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -322,9 +393,6 @@
                     current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
-                // take index of array where info which work_item will
-                // be broadcast its value is stored
-                int midx = 4 * wg_offset + 2;
                 // take subgroup local id of this work_item
                 // Check result
                 for (wi_id = 0; wi_id < current_sbs; ++wi_id)
@@ -354,7 +422,6 @@
             m += 4 * lws;
-        log_info("  sub_group_inverse_ballot... passed\n");
         return TEST_PASS;
@@ -363,6 +430,13 @@
 // Test for bit count/inclusive and exclusive scan/ find lsb msb ballot function
 template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
         int wi_id, wg_id, sb_id;
@@ -375,14 +449,10 @@
         int last_subgroup_size = 0;
         int current_sbs = 0;
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         if (non_uniform_size)
-            log_info("  non uniform work group size mode ON\n");
-        int e;
         for (wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
             if (non_uniform_size && wg_id == wg_number - 1)
@@ -423,7 +493,7 @@
-                    log_error("Unknown operation...");
+                    log_error("Unknown operation...\n");
@@ -451,15 +521,15 @@
         else if (operation == BallotOp::ballot_inclusive_scan
                  || operation == BallotOp::ballot_exclusive_scan)
-            for (cl_uint i = 0; i <= sub_group_local_id; ++i) mask.set(i);
-            if (operation == BallotOp::ballot_exclusive_scan)
-                mask.reset(sub_group_local_id);
+            for (cl_uint i = 0; i < sub_group_local_id; ++i) mask.set(i);
+            if (operation == BallotOp::ballot_inclusive_scan)
+                mask.set(sub_group_local_id);
         return mask;
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -469,7 +539,7 @@
         int non_uniform_size = gws % lws;
         int wg_number = gws / lws;
         wg_number = non_uniform_size ? wg_number + 1 : wg_number;
-        cl_uint4 expected_result, device_result;
+        cl_uint expected_result, device_result;
         int last_subgroup_size = 0;
         int current_sbs = 0;
@@ -501,7 +571,7 @@
                     current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
                 // Check result
-                expected_result = { 0, 0, 0, 0 };
+                expected_result = 0;
                 for (wi_id = 0; wi_id < current_sbs; ++wi_id)
                 { // for subgroup element
                     bs128 bs;
@@ -510,34 +580,37 @@
                         | (bs128(mx[wg_offset + wi_id].s1) << 32)
                         | (bs128(mx[wg_offset + wi_id].s2) << 64)
                         | (bs128(mx[wg_offset + wi_id].s3) << 96);
-                    bs &= getImportantBits(wi_id, current_sbs);
-                    device_result = my[wg_offset + wi_id];
+                    bs &= getImportantBits(wi_id, sbs);
+                    device_result = my[wg_offset + wi_id].s0;
                     if (operation == BallotOp::ballot_inclusive_scan
                         || operation == BallotOp::ballot_exclusive_scan
                         || operation == BallotOp::ballot_bit_count)
-                        expected_result.s0 = bs.count();
+                        expected_result = bs.count();
                         if (!compare(device_result, expected_result))
                             log_error("ERROR: sub_group_%s "
                                       "mismatch for local id %d in sub group "
-                                      "%d in group %d obtained {%d, %d, %d, "
-                                      "%d}, expected {%d, %d, %d, %d}\n",
+                                      "%d in group %d obtained %d, "
+                                      "expected %d\n",
                                       operation_names(operation), wi_id, sb_id,
-                                      wg_id, device_result.s0, device_result.s1,
-                                      device_result.s2, device_result.s3,
-                                      expected_result.s0, expected_result.s1,
-                                      expected_result.s2, expected_result.s3);
+                                      wg_id, device_result, expected_result);
                             return TEST_FAIL;
                     else if (operation == BallotOp::ballot_find_lsb)
-                        for (int id = 0; id < current_sbs; ++id)
+                        if (bs.none())
+                        {
+                            // Return value is undefined when no bits are set,
+                            // so skip validation:
+                            continue;
+                        }
+                        for (int id = 0; id < sbs; ++id)
                             if (bs.test(id))
-                                expected_result.s0 = id;
+                                expected_result = id;
@@ -545,23 +618,26 @@
                             log_error("ERROR: sub_group_ballot_find_lsb "
                                       "mismatch for local id %d in sub group "
-                                      "%d in group %d obtained {%d, %d, %d, "
-                                      "%d}, expected {%d, %d, %d, %d}\n",
-                                      wi_id, sb_id, wg_id, device_result.s0,
-                                      device_result.s1, device_result.s2,
-                                      device_result.s3, expected_result.s0,
-                                      expected_result.s1, expected_result.s2,
-                                      expected_result.s3);
+                                      "%d in group %d obtained %d, "
+                                      "expected %d\n",
+                                      wi_id, sb_id, wg_id, device_result,
+                                      expected_result);
                             return TEST_FAIL;
                     else if (operation == BallotOp::ballot_find_msb)
-                        for (int id = current_sbs - 1; id >= 0; --id)
+                        if (bs.none())
+                        {
+                            // Return value is undefined when no bits are set,
+                            // so skip validation:
+                            continue;
+                        }
+                        for (int id = sbs - 1; id >= 0; --id)
                             if (bs.test(id))
-                                expected_result.s0 = id;
+                                expected_result = id;
@@ -569,13 +645,10 @@
                             log_error("ERROR: sub_group_ballot_find_msb "
                                       "mismatch for local id %d in sub group "
-                                      "%d in group %d obtained {%d, %d, %d, "
-                                      "%d}, expected {%d, %d, %d, %d}\n",
-                                      wi_id, sb_id, wg_id, device_result.s0,
-                                      device_result.s1, device_result.s2,
-                                      device_result.s3, expected_result.s0,
-                                      expected_result.s1, expected_result.s2,
-                                      expected_result.s3);
+                                      "%d in group %d obtained %d, "
+                                      "expected %d\n",
+                                      wi_id, sb_id, wg_id, device_result,
+                                      expected_result);
                             return TEST_FAIL;
@@ -585,8 +658,6 @@
             y += lws;
             m += 4 * lws;
-        log_info("  sub_group_ballot_%s(%s)... passed\n",
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
@@ -594,15 +665,21 @@
 // test mask functions
 template <typename Ty, BallotOp operation> struct SMASK
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  get_sub_group_%s_mask...%s\n", operation_names(operation),
+                 extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
-        int wi_id, wg_id, l, sb_id;
+        int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
         int sb_number = (lws + sbs - 1) / sbs;
         int wg_number = gws / lws;
-        log_info("  get_sub_group_%s_mask...\n", operation_names(operation));
         for (wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
             for (sb_id = 0; sb_id < sb_number; ++sb_id)
@@ -631,8 +708,8 @@
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -678,245 +755,130 @@
             y += lws;
             m += 4 * lws;
-        log_info("  get_sub_group_%s_mask... passed\n",
-                 operation_names(operation));
         return TEST_PASS;
-static const char *bcast_non_uniform_source =
-    "__kernel void test_bcast_non_uniform(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
-    "        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n"
-    "    } else {\n"
-    "       out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n"
-    "    }\n"
-    "}\n";
-static const char *bcast_first_source =
-    "__kernel void test_bcast_first(const __global Type *in, __global int4 "
-    "*xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
-    "       out[gid] = sub_group_broadcast_first(x);\n"
-    "    } else {\n"
-    "       out[gid] = sub_group_broadcast_first(x);\n"
-    "    }\n"
-    "}\n";
-static const char *ballot_bit_count_source =
-    "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-static const char *ballot_inclusive_scan_source =
-    "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type "
-    "*in, __global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-static const char *ballot_exclusive_scan_source =
-    "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type "
-    "*in, __global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-static const char *ballot_find_lsb_source =
-    "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-static const char *ballot_find_msb_source =
-    "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);"
-    "    value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
-    "    out[gid] = value ;"
-    "}\n";
-static const char *get_subgroup_ge_mask_source =
-    "__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_ge_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-static const char *get_subgroup_gt_mask_source =
-    "__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_gt_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-static const char *get_subgroup_le_mask_source =
-    "__kernel void test_get_sub_group_le_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_le_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-static const char *get_subgroup_lt_mask_source =
-    "__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_lt_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-static const char *get_subgroup_eq_mask_source =
-    "__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_eq_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-static const char *ballot_source =
-    "__kernel void test_sub_group_ballot(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "uint4 full_ballot = sub_group_ballot(1);\n"
-    "uint divergence_mask;\n"
-    "uint4 partial_ballot;\n"
-    "uint gid = get_global_id(0);"
-    "XY(xy,gid);\n"
-    "if (get_sub_group_local_id() & 1) {\n"
-    "    divergence_mask = 0xaaaaaaaa;\n"
-    "    partial_ballot = sub_group_ballot(1);\n"
-    "} else {\n"
-    "    divergence_mask = 0x55555555;\n"
-    "    partial_ballot = sub_group_ballot(1);\n"
-    "}\n"
-    " size_t lws = get_local_size(0);\n"
-    "uint4 masked_ballot = full_ballot;\n"
-    "masked_ballot.x &= divergence_mask;\n"
-    "masked_ballot.y &= divergence_mask;\n"
-    "masked_ballot.z &= divergence_mask;\n"
-    "masked_ballot.w &= divergence_mask;\n"
-    "out[gid] = all(masked_ballot == partial_ballot);\n"
-    "} \n";
-static const char *ballot_source_inverse =
-    "__kernel void test_sub_group_ballot_inverse(const __global "
-    "Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(10,0,0,0);\n"
-    "    if (get_sub_group_local_id() & 1) {"
-    "        uint4 partial_ballot_mask = "
-    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
-    "            value = (uint4)(1,0,0,1);\n"
-    "        } else {\n"
-    "            value = (uint4)(0,0,0,1);\n"
-    "        }\n"
-    "    } else {\n"
-    "       uint4 partial_ballot_mask = "
-    "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
-    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
-    "            value = (uint4)(1,0,0,2);\n"
-    "        } else {\n"
-    "            value = (uint4)(0,0,0,2);\n"
-    "        }\n"
-    "    }\n"
-    "    out[gid] = value;\n"
-    "}\n";
-static const char *ballot_bit_extract_source =
-    "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint index = xy[gid].z;\n"
-    "    uint4 value = (uint4)(10,0,0,0);\n"
-    "    if (get_sub_group_local_id() & 1) {"
-    "       if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
-    "           value = (uint4)(1,0,0,1);\n"
-    "       } else {\n"
-    "           value = (uint4)(0,0,0,1);\n"
-    "       }\n"
-    "    } else {\n"
-    "       if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
-    "           value = (uint4)(1,0,0,2);\n"
-    "       } else {\n"
-    "           value = (uint4)(0,0,0,2);\n"
-    "       }\n"
-    "    }\n"
-    "    out[gid] = value;\n"
-    "}\n";
+std::string sub_group_non_uniform_broadcast_source = R"(
+__kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
+        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);
+    } else {
+        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);
+    }
+std::string sub_group_broadcast_first_source = R"(
+__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
+        out[gid] = sub_group_broadcast_first(x);;
+    } else {
+        out[gid] = sub_group_broadcast_first(x);;
+    }
+std::string sub_group_ballot_bit_scan_find_source = R"(
+__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    uint4 value = (uint4)(0,0,0,0);
+    value = (uint4)(%s(x),0,0,0);
+    out[gid] = value;
+std::string sub_group_ballot_mask_source = R"(
+__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    xy[gid].z = get_max_sub_group_size();
+    Type x = in[gid];
+    uint4 mask = %s();
+    out[gid] = mask;
+std::string sub_group_ballot_source = R"(
+__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
+    uint gid = get_global_id(0);
+    XY(xy,gid);
+    uint subgroup_local_id = get_sub_group_local_id();
+    uint elect_work_item = 1 << (subgroup_local_id % 32);
+    uint work_item_mask;
+    if (subgroup_local_id < 32) {
+        work_item_mask = work_item_mask_vector.x;
+    } else if(subgroup_local_id < 64) {
+        work_item_mask = work_item_mask_vector.y;
+    } else if(subgroup_local_id < 96) {
+        work_item_mask = work_item_mask_vector.z;
+    } else if(subgroup_local_id < 128) {
+        work_item_mask = work_item_mask_vector.w;
+    }
+    uint4 value = (uint4)(0, 0, 0, 0);
+    if (elect_work_item & work_item_mask) {
+        value = sub_group_ballot(in[gid].s0);
+    }
+    out[gid] = value;
+std::string sub_group_inverse_ballot_source = R"(
+__kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    uint4 value = (uint4)(10,0,0,0);
+    if (get_sub_group_local_id() & 1) {
+        uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);
+        if (sub_group_inverse_ballot(partial_ballot_mask)) {
+            value = (uint4)(1,0,0,1);
+        } else {
+            value = (uint4)(0,0,0,1);
+        }
+    } else {
+        uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555);
+        if (sub_group_inverse_ballot(partial_ballot_mask)) {
+            value = (uint4)(1,0,0,2);
+        } else {
+            value = (uint4)(0,0,0,2);
+        }
+    }
+    out[gid] = value;
+std::string sub_group_ballot_bit_extract_source = R"(
+ __kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    uint index = xy[gid].z;
+    uint4 value = (uint4)(10,0,0,0);
+    if (get_sub_group_local_id() & 1) {
+        if (sub_group_ballot_bit_extract(x, xy[gid].z)) {
+            value = (uint4)(1,0,0,1);
+        } else {
+            value = (uint4)(0,0,0,1);
+        }
+    } else {
+        if (sub_group_ballot_bit_extract(x, xy[gid].w)) {
+            value = (uint4)(1,0,0,2);
+        } else {
+            value = (uint4)(0,0,0,2);
+        }
+    }
+    out[gid] = value;
 template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
     int error =
         rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
-            "test_bcast_non_uniform", bcast_non_uniform_source);
+            "sub_group_non_uniform_broadcast");
     return error;
@@ -926,11 +888,21 @@
 int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
                                    cl_command_queue queue, int num_elements)
-    std::vector<std::string> required_extensions = { "cl_khr_subgroup_ballot" };
+    if (!is_extension_available(device, "cl_khr_subgroup_ballot"))
+    {
+        log_info("cl_khr_subgroup_ballot is not supported on this device, "
+                 "skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_ballot_mask_source);
+    test_params.save_kernel_source(sub_group_non_uniform_broadcast_source,
+                                   "sub_group_non_uniform_broadcast");
+    test_params.save_kernel_source(sub_group_broadcast_first_source,
+                                   "sub_group_broadcast_first");
     RunTestForType rft(device, context, queue, num_elements, test_params);
     // non uniform broadcast functions
@@ -1014,76 +986,92 @@
     // broadcast first functions
     error |=
         rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
-            "test_bcast_first", bcast_first_source);
+            "sub_group_broadcast_first");
     error |= rft.run_impl<cl_uint,
                           BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_long,
                           BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_ulong,
                           BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_short,
                           BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_ushort,
                           BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_char,
                           BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_uchar,
                           BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_float,
                           BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_double,
                           BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<
         BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     // mask functions
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
-        "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source);
+        "get_sub_group_eq_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
-        "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source);
+        "get_sub_group_ge_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
-        "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source);
+        "get_sub_group_gt_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
-        "test_get_sub_group_le_mask", get_subgroup_le_mask_source);
+        "get_sub_group_le_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
-        "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source);
+        "get_sub_group_lt_mask");
-    // ballot functions
-    error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot",
-                                                    ballot_source);
-    error |= rft.run_impl<cl_uint4,
-                          BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
-        "test_sub_group_ballot_inverse", ballot_source_inverse);
-    error |= rft.run_impl<
+    // sub_group_ballot function
+    WorkGroupParams test_params_ballot(global_work_size, local_work_size, 3);
+    test_params_ballot.save_kernel_source(sub_group_ballot_source);
+    RunTestForType rft_ballot(device, context, queue, num_elements,
+                              test_params_ballot);
+    error |=
+        rft_ballot.run_impl<cl_uint4, BALLOT<cl_uint4>>("sub_group_ballot");
+    // ballot arithmetic functions
+    WorkGroupParams test_params_arith(global_work_size, local_work_size);
+    test_params_arith.save_kernel_source(sub_group_ballot_bit_scan_find_source);
+    test_params_arith.save_kernel_source(sub_group_inverse_ballot_source,
+                                         "sub_group_inverse_ballot");
+    test_params_arith.save_kernel_source(sub_group_ballot_bit_extract_source,
+                                         "sub_group_ballot_bit_extract");
+    RunTestForType rft_arith(device, context, queue, num_elements,
+                             test_params_arith);
+    error |=
+        rft_arith.run_impl<cl_uint4,
+                           BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
+            "sub_group_inverse_ballot");
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
-        "test_sub_group_ballot_bit_extract", ballot_bit_extract_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_bit_extract");
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
-        "test_sub_group_ballot_bit_count", ballot_bit_count_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_bit_count");
+    error |= rft_arith.run_impl<
         BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
-        "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_inclusive_scan");
+    error |= rft_arith.run_impl<
         BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
-        "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_exclusive_scan");
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
-        "test_sub_group_ballot_find_lsb", ballot_find_lsb_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_find_lsb");
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
-        "test_sub_group_ballot_find_msb", ballot_find_msb_source);
+        "sub_group_ballot_find_msb");
     return error;
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
index 588e9ce..38652d5 100644
--- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -18,172 +18,55 @@
 #include "subgroup_common_templates.h"
 #include "harness/typeWrappers.h"
-#define CLUSTER_SIZE 4
-#define CLUSTER_SIZE_STR "4"
 namespace {
-static const char *redadd_clustered_source =
-    "__kernel void test_redadd_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redmax_clustered_source =
-    "__kernel void test_redmax_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redmin_clustered_source =
-    "__kernel void test_redmin_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redmul_clustered_source =
-    "__kernel void test_redmul_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redand_clustered_source =
-    "__kernel void test_redand_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redor_clustered_source =
-    "__kernel void test_redor_clustered(const __global Type *in, __global int4 "
-    "*xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redxor_clustered_source =
-    "__kernel void test_redxor_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-static const char *redand_clustered_logical_source =
-    "__kernel void test_redand_clustered_logical(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
-    ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = "
-    "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
-    "}\n";
-static const char *redor_clustered_logical_source =
-    "__kernel void test_redor_clustered_logical(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
-    ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = "
-    "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
-    "}\n";
-static const char *redxor_clustered_logical_source =
-    "__kernel void test_redxor_clustered_logical(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if ( sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
-    ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = "
-    "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
-    "}\n";
+// OpenCL C template shared by all sub_group_clustered_reduce_* kernels.
+// Every %s is a printf-style placeholder; presumably the harness substitutes
+// the name of the built-in under test for each of them when the kernel is
+// built (TODO confirm against save_kernel_source / run_impl).
+// The kernel records in xy[gid].w whether the built-in's result has the same
+// size as the input element (sizeof(in[gid]) if so, 0 otherwise).
+// cluster_size is a runtime kernel argument, but each call passes a literal
+// cluster size, so the switch enumerates the powers of two from 1 to 128.
+// NOTE(review): the cl_khr_subgroup_clustered_reduce extension requires the
+// cluster size to be an integer constant expression, which is presumably why
+// a switch is used instead of forwarding cluster_size directly -- confirm.
+// NOTE(review): if cluster_size matches none of the cases, r is stored to
+// out[gid] without ever having been assigned.
+std::string sub_group_clustered_reduce_source = R"(
+__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out,
+                      uint cluster_size) {
+        Type r;
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        xy[gid].w = 0;
+        Type v = in[gid];
+        if (sizeof(in[gid]) == sizeof(%s(v, 1))) {
+            xy[gid].w = sizeof(in[gid]);
+        }
+        switch (cluster_size) {
+            case 1: r = %s(v, 1); break;
+            case 2: r = %s(v, 2); break;
+            case 4: r = %s(v, 4); break;
+            case 8: r = %s(v, 8); break;
+            case 16: r = %s(v, 16); break;
+            case 32: r = %s(v, 32); break;
+            case 64: r = %s(v, 64); break;
+            case 128: r = %s(v, 128); break;
+        }
+        out[gid] = r;
 // Test for reduce cluster functions
 template <typename Ty, ArithmeticOp operation> struct RED_CLU
+    // Logs the progress line for one clustered-reduce test run: the
+    // operation name, the element type name and its size in bytes, followed
+    // by the caller-supplied extra_text (e.g. a status suffix).
+    // test_params is accepted for interface uniformity and is not read here.
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        // sizeof(Ty) has type size_t, so it must be formatted with %zu;
+        // %d expects an int and mismatches the argument on 64-bit targets.
+        log_info("  sub_group_clustered_reduce_%s(%s, %zu bytes) ...%s\n",
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 sizeof(Ty), extra_text);
+    }
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ...\n",
-                 operation_names(operation), TypeManager<Ty>::name(),
-                 sizeof(Ty));
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
@@ -219,34 +102,34 @@
                 int ii = j * ns;
                 int n = ii + ns > nw ? nw - ii : ns;
-                int midx = 4 * ii + 2;
                 std::vector<Ty> clusters_results;
-                int clusters_counter = ns / CLUSTER_SIZE;
+                int clusters_counter = ns / test_params.cluster_size;
                 // Compute target
                 Ty tr = mx[ii];
                 for (int i = 0; i < n; ++i)
-                    if (i % CLUSTER_SIZE == 0)
+                    if (i % test_params.cluster_size == 0)
                         tr = mx[ii + i];
                         tr = calculate<Ty>(tr, mx[ii + i], operation);
-                    clusters_results[i / CLUSTER_SIZE] = tr;
+                    clusters_results[i / test_params.cluster_size] = tr;
                 // Check result
                 for (int i = 0; i < n; ++i)
                     Ty rr = my[ii + i];
-                    tr = clusters_results[i / CLUSTER_SIZE];
+                    tr = clusters_results[i / test_params.cluster_size];
                     if (!compare(rr, tr))
-                            "ERROR: sub_group_clustered_reduce_%s(%s) mismatch "
-                            "for local id %d in sub group %d in group %d\n",
+                            "ERROR: sub_group_clustered_reduce_%s(%s, %u) "
+                            "mismatch for local id %d in sub group %d in group "
+                            "%d\n",
                             operation_names(operation), TypeManager<Ty>::name(),
-                            i, j, k);
+                            test_params.cluster_size, i, j, k);
                         return TEST_FAIL;
@@ -256,9 +139,6 @@
             y += nw;
             m += 4 * nw;
-        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ... passed\n",
-                 operation_names(operation), TypeManager<Ty>::name(),
-                 sizeof(Ty));
         return TEST_PASS;
@@ -267,34 +147,34 @@
 int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
     int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
-        "test_redadd_clustered", redadd_clustered_source);
+        "sub_group_clustered_reduce_add");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
-        "test_redmax_clustered", redmax_clustered_source);
+        "sub_group_clustered_reduce_max");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
-        "test_redmin_clustered", redmin_clustered_source);
+        "sub_group_clustered_reduce_min");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
-        "test_redmul_clustered", redmul_clustered_source);
+        "sub_group_clustered_reduce_mul");
     return error;
 template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
     int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
-        "test_redand_clustered", redand_clustered_source);
+        "sub_group_clustered_reduce_and");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
-        "test_redor_clustered", redor_clustered_source);
+        "sub_group_clustered_reduce_or");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
-        "test_redxor_clustered", redxor_clustered_source);
+        "sub_group_clustered_reduce_xor");
     return error;
 template <typename T>
 int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
     int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
-        "test_redand_clustered_logical", redand_clustered_logical_source);
+        "sub_group_clustered_reduce_logical_and");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
-        "test_redor_clustered_logical", redor_clustered_logical_source);
+        "sub_group_clustered_reduce_logical_or");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
-        "test_redxor_clustered_logical", redxor_clustered_logical_source);
+        "sub_group_clustered_reduce_logical_xor");
     return error;
@@ -305,13 +185,17 @@
                                              cl_command_queue queue,
                                              int num_elements)
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_clustered_reduce"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_clustered_reduce"))
+    {
+        log_info("cl_khr_subgroup_clustered_reduce is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size, -1, 3);
+    test_params.save_kernel_source(sub_group_clustered_reduce_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp
index 98401b8..c9e6bb6 100644
--- a/test_conformance/subgroups/test_subgroup_extended_types.cpp
+++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp
@@ -24,30 +24,30 @@
 template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
     int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
-        "test_bcast", bcast_source);
+        "sub_group_broadcast");
     return error;
 template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
-    int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
-                                                               redadd_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
-                                                            redmax_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
-                                                            redmin_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
-                                                             scinadd_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
-                                                             scinmax_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
-                                                             scinmin_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
-                                                             scexadd_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
-                                                             scexmax_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
-                                                             scexmin_source);
+    int error =
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_inclusive_add");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_inclusive_max");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_inclusive_min");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_exclusive_add");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_exclusive_max");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_exclusive_min");
     return error;
@@ -59,15 +59,21 @@
                                            cl_command_queue queue,
                                            int num_elements)
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_extended_types"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_extended_types"))
+    {
+        log_info("cl_khr_subgroup_extended_types is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
-    RunTestForType rft(device, context, queue, num_elements, test_params);
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_reduction_scan_source);
+    test_params.save_kernel_source(sub_group_generic_source,
+                                   "sub_group_broadcast");
+    RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_broadcast_for_extended_type<cl_uint2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
     error |= run_broadcast_for_extended_type<cl_uint4>(rft);
@@ -102,22 +108,26 @@
     error |= run_broadcast_for_extended_type<cl_double8>(rft);
     error |= run_broadcast_for_extended_type<cl_double16>(rft);
+    error |= run_broadcast_for_extended_type<cl_ushort>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort4>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort8>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort16>(rft);
+    error |= run_broadcast_for_extended_type<cl_short>(rft);
     error |= run_broadcast_for_extended_type<cl_short2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft);
     error |= run_broadcast_for_extended_type<cl_short4>(rft);
     error |= run_broadcast_for_extended_type<cl_short8>(rft);
     error |= run_broadcast_for_extended_type<cl_short16>(rft);
+    error |= run_broadcast_for_extended_type<cl_uchar>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar4>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar8>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar16>(rft);
+    error |= run_broadcast_for_extended_type<cl_char>(rft);
     error |= run_broadcast_for_extended_type<cl_char2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft);
     error |= run_broadcast_for_extended_type<cl_char4>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
index eb46ff0..02fc507 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -20,333 +20,25 @@
 namespace {
-static const char *scinadd_non_uniform_source = R"(
-    __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+std::string sub_group_non_uniform_arithmetic_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
         int gid = get_global_id(0);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
-            }
-    }
-static const char *scinmax_non_uniform_source = R"(
-    __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
-            }
-    }
-static const char *scinmin_non_uniform_source = R"(
-    __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
-            }
-    }
-static const char *scinmul_non_uniform_source = R"(
-    __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
-            }
-    }
-static const char *scinand_non_uniform_source = R"(
-    __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
-            }
-    }
-static const char *scinor_non_uniform_source = R"(
-    __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
-            }
-    }
-static const char *scinxor_non_uniform_source = R"(
-    __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
-            }
-    }
-static const char *scinand_non_uniform_logical_source = R"(
-    __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
-            }
-    }
-static const char *scinor_non_uniform_logical_source = R"(
-    __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
-            }
-    }
-static const char *scinxor_non_uniform_logical_source = R"(
-    __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
-            }
-    }
-static const char *scexadd_non_uniform_source = R"(
-    __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
-            }
-    }
-static const char *scexmax_non_uniform_source = R"(
-    __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
-            }
-    }
-static const char *scexmin_non_uniform_source = R"(
-    __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
-            }
-    }
-static const char *scexmul_non_uniform_source = R"(
-    __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
-            }
-    }
-static const char *scexand_non_uniform_source = R"(
-    __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
-            }
-    }
-static const char *scexor_non_uniform_source = R"(
-    __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
-            }
-    }
-static const char *scexxor_non_uniform_source = R"(
-    __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
-            }
-    }
-static const char *scexand_non_uniform_logical_source = R"(
-    __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
-            }
-    }
-static const char *scexor_non_uniform_logical_source = R"(
-    __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
-            }
-    }
-static const char *scexxor_non_uniform_logical_source = R"(
-    __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
-            }
-    }
-static const char *redadd_non_uniform_source = R"(
-    __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
-            }
-    }
-static const char *redmax_non_uniform_source = R"(
-    __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
-            }
-    }
-static const char *redmin_non_uniform_source = R"(
-    __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
-            }
-    }
-static const char *redmul_non_uniform_source = R"(
-    __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
-            }
-    }
-static const char *redand_non_uniform_source = R"(
-    __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
-            }
-    }
-static const char *redor_non_uniform_source = R"(
-    __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
-            }
-    }
-static const char *redxor_non_uniform_source = R"(
-    __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
-            }
-    }
-static const char *redand_non_uniform_logical_source = R"(
-    __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
-            }
-    }
-static const char *redor_non_uniform_logical_source = R"(
-    __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
-            }
-    }
-static const char *redxor_non_uniform_logical_source = R"(
-    __kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
-            }
+        uint subgroup_local_id = get_sub_group_local_id();
+        uint elect_work_item = 1 << (subgroup_local_id % 32);
+        uint work_item_mask;
+        if(subgroup_local_id < 32) {
+            work_item_mask = work_item_mask_vector.x;
+        } else if(subgroup_local_id < 64) {
+            work_item_mask = work_item_mask_vector.y;
+        } else if(subgroup_local_id < 96) {
+            work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
+        }
+        if (elect_work_item & work_item_mask){
+            out[gid] = %s(in[gid]);
+        }
@@ -354,52 +46,52 @@
 int run_functions_add_mul_max_min_for_type(RunTestForType rft)
     int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
-        "test_scinadd_non_uniform", scinadd_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_add");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
-        "test_scinmul_non_uniform", scinmul_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_mul");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
-        "test_scinmax_non_uniform", scinmax_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_max");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
-        "test_scinmin_non_uniform", scinmin_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_min");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
-        "test_scexadd_non_uniform", scexadd_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_add");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
-        "test_scexmul_non_uniform", scexmul_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_mul");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
-        "test_scexmax_non_uniform", scexmax_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_max");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
-        "test_scexmin_non_uniform", scexmin_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_min");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
-        "test_redadd_non_uniform", redadd_non_uniform_source);
+        "sub_group_non_uniform_reduce_add");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
-        "test_redmul_non_uniform", redmul_non_uniform_source);
+        "sub_group_non_uniform_reduce_mul");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
-        "test_redmax_non_uniform", redmax_non_uniform_source);
+        "sub_group_non_uniform_reduce_max");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
-        "test_redmin_non_uniform", redmin_non_uniform_source);
+        "sub_group_non_uniform_reduce_min");
     return error;
 template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
     int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
-        "test_scinand_non_uniform", scinand_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_and");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
-        "test_scinor_non_uniform", scinor_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_or");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
-        "test_scinxor_non_uniform", scinxor_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_xor");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
-        "test_scexand_non_uniform", scexand_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_and");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
-        "test_scexor_non_uniform", scexor_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_or");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
-        "test_scexxor_non_uniform", scexxor_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_xor");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
-        "test_redand_non_uniform", redand_non_uniform_source);
+        "sub_group_non_uniform_reduce_and");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
-        "test_redor_non_uniform", redor_non_uniform_source);
+        "sub_group_non_uniform_reduce_or");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
-        "test_redxor_non_uniform", redxor_non_uniform_source);
+        "sub_group_non_uniform_reduce_xor");
     return error;
@@ -407,23 +99,23 @@
 int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
     int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
-        "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_inclusive_logical_and");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
-        "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_inclusive_logical_or");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
-        "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_inclusive_logical_xor");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
-        "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_exclusive_logical_and");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
-        "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_exclusive_logical_or");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
-        "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_exclusive_logical_xor");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
-        "test_redand_non_uniform_logical", redand_non_uniform_logical_source);
+        "sub_group_non_uniform_reduce_logical_and");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
-        "test_redor_non_uniform_logical", redor_non_uniform_logical_source);
+        "sub_group_non_uniform_reduce_logical_or");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
-        "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
+        "sub_group_non_uniform_reduce_logical_xor");
     return error;
@@ -434,17 +126,18 @@
                                                    cl_command_queue queue,
                                                    int num_elements)
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_non_uniform_arithmetic"
-    };
-    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
-                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
-                                 0x00ffff00, 0x80000000, 0xaaaaaaaa };
+    if (!is_extension_available(device,
+                                "cl_khr_subgroup_non_uniform_arithmetic"))
+    {
+        log_info("cl_khr_subgroup_non_uniform_arithmetic is not supported on "
+                 "this device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions, masks);
+    WorkGroupParams test_params(global_work_size, local_work_size, 3);
+    test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
@@ -470,4 +163,4 @@
     error |= run_functions_logical_and_or_xor_for_type<cl_int>(rft);
     return error;
\ No newline at end of file
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index 2b00b4d..3be1ba3 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -22,31 +22,27 @@
 template <typename T, NonUniformVoteOp operation> struct VOTE
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s%s(%s)...%s\n",
+                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
+                 operation_names(operation), TypeManager<T>::name(),
+                 extra_text);
+    }
     static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params)
         int i, ii, j, k, n;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         int non_uniform_size = ng % nw;
         ng = ng / nw;
         int last_subgroup_size = 0;
         ii = 0;
-        log_info("  sub_group_%s%s... \n",
-                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
-                 operation_names(operation));
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x data type (%s)\n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask,
-                 TypeManager<T>::name());
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
         if (operation == NonUniformVoteOp::elect) return;
         for (k = 0; k < ng; ++k)
@@ -92,14 +88,13 @@
-    static int chk(T *x, T *y, T *mx, T *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(T *x, T *y, T *mx, T *my, cl_int *m,
+                           const WorkGroupParams &test_params)
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         cl_int tr, rr;
         int non_uniform_size = ng % nw;
@@ -141,8 +136,7 @@
                 std::set<int> active_work_items;
                 for (i = 0; i < n; ++i)
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (work_items_mask & check_work_item)
+                    if (test_params.work_items_mask.test(i))
                         switch (operation)
@@ -172,34 +166,28 @@
                 if (active_work_items.empty())
-                    log_info("  no one workitem acitve... in workgroup id = %d "
-                             "subgroup id = %d\n",
-                             k, j);
+                    continue;
-                else
+                auto lowest_active = active_work_items.begin();
+                for (const int &active_work_item : active_work_items)
-                    auto lowest_active = active_work_items.begin();
-                    for (const int &active_work_item : active_work_items)
+                    i = active_work_item;
+                    if (operation == NonUniformVoteOp::elect)
-                        i = active_work_item;
-                        if (operation == NonUniformVoteOp::elect)
-                        {
-                            i == *lowest_active ? tr = 1 : tr = 0;
-                        }
+                        i == *lowest_active ? tr = 1 : tr = 0;
+                    }
-                        // normalize device values on host, non zero set 1.
-                        rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
+                    // normalize device values on host, non zero set 1.
+                    rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
-                        if (rr != tr)
-                        {
-                            log_error("ERROR: sub_group_%s() \n",
-                                      operation_names(operation));
-                            log_error(
-                                "mismatch for work item %d sub group %d in "
-                                "work group %d. Expected: %d Obtained: %d\n",
-                                i, j, k, tr, rr);
-                            return TEST_FAIL;
-                        }
+                    if (rr != tr)
+                    {
+                        log_error("ERROR: sub_group_%s() \n",
+                                  operation_names(operation));
+                        log_error("mismatch for work item %d sub group %d in "
+                                  "work group %d. Expected: %d Obtained: %d\n",
+                                  i, j, k, tr, rr);
+                        return TEST_FAIL;
@@ -209,52 +197,50 @@
             m += 4 * nw;
-        log_info("  sub_group_%s%s... passed\n",
-                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
-                 operation_names(operation));
         return TEST_PASS;
-static const char *elect_source = R"(
-    __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
+std::string sub_group_elect_source = R"(
+    __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
         int gid = get_global_id(0);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_elect();
-            }
+        uint subgroup_local_id = get_sub_group_local_id();
+        uint elect_work_item = 1 << (subgroup_local_id % 32);
+        uint work_item_mask;
+        if(subgroup_local_id < 32) {
+            work_item_mask = work_item_mask_vector.x;
+        } else if(subgroup_local_id < 64) {
+            work_item_mask = work_item_mask_vector.y;
+        } else if(subgroup_local_id < 96) {
+            work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
+        }
+        if (elect_work_item & work_item_mask){
+            out[gid] = sub_group_elect();
+        }
-static const char *non_uniform_any_source = R"(
-    __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
+std::string sub_group_non_uniform_any_all_all_equal_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
         int gid = get_global_id(0);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_any(in[gid]);
-            }
-    }
-static const char *non_uniform_all_source = R"(
-    __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_all(in[gid]);
-            }
-    }
-static const char *non_uniform_all_equal_source = R"(
-    __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_all_equal(in[gid]);
+        uint subgroup_local_id = get_sub_group_local_id();
+        uint elect_work_item = 1 << (subgroup_local_id % 32);
+        uint work_item_mask;
+        if(subgroup_local_id < 32) {
+            work_item_mask = work_item_mask_vector.x;
+        } else if(subgroup_local_id < 64) {
+            work_item_mask = work_item_mask_vector.y;
+        } else if(subgroup_local_id < 96) {
+            work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
+        }
+        if (elect_work_item & work_item_mask){
+                out[gid] = %s(in[gid]);
@@ -262,7 +248,7 @@
 template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
     int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
-        "test_non_uniform_all_equal", non_uniform_all_equal_source);
+        "sub_group_non_uniform_all_equal");
     return error;
@@ -272,17 +258,19 @@
                                              cl_command_queue queue,
                                              int num_elements)
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_non_uniform_vote"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_non_uniform_vote"))
+    {
+        log_info("cl_khr_subgroup_non_uniform_vote is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
-    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
-                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
-                                 0x00ffff00, 0x80000000 };
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions, masks);
+    WorkGroupParams test_params(global_work_size, local_work_size, 3);
+    test_params.save_kernel_source(
+        sub_group_non_uniform_any_all_all_equal_source);
+    test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_vote_all_equal_for_type<cl_int>(rft);
@@ -294,10 +282,10 @@
     error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
     error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
-        "test_non_uniform_all", non_uniform_all_source);
+        "sub_group_non_uniform_all");
     error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
-        "test_elect", elect_source);
+        "sub_group_elect");
     error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
-        "test_non_uniform_any", non_uniform_any_source);
+        "sub_group_non_uniform_any");
     return error;
diff --git a/test_conformance/subgroups/test_subgroup_rotate.cpp b/test_conformance/subgroups/test_subgroup_rotate.cpp
new file mode 100644
index 0000000..db0f48e
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_rotate.cpp
@@ -0,0 +1,109 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_kernels.h"
+#include "subgroup_common_templates.h"
+#include "harness/conversions.h"
+#include "harness/typeWrappers.h"
+namespace {
+// Runs the "sub_group_rotate" test for element type T by calling
+// rft.run_impl with SHF<T, ShuffleOp::rotate> as the second template argument,
+// and returns run_impl's result unchanged.
+template <typename T> int run_rotate_for_type(RunTestForType rft)
+    int error = rft.run_impl<T, SHF<T, ShuffleOp::rotate>>("sub_group_rotate");
+    return error;
+// OpenCL C kernel template for the clustered rotate test. The %s placeholders
+// are substituted outside this block (the substitution code is not visible
+// here). The kernel reads x = in[gid] and delta = xy[gid].z, then switches on
+// the runtime cluster_size (powers of two from 1 to 128) so that every call
+// passes the cluster size as a literal, and writes the result to out[gid].
+// NOTE(review): r is left unassigned when cluster_size matches no case --
+// confirm callers only pass the listed sizes.
+std::string sub_group_clustered_rotate_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out,
+                          uint cluster_size) {
+        Type r;
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        Type x = in[gid];
+        int delta = xy[gid].z;
+        switch (cluster_size) {
+            case 1: r = %s(x, delta, 1); break;
+            case 2: r = %s(x, delta, 2); break;
+            case 4: r = %s(x, delta, 4); break;
+            case 8: r = %s(x, delta, 8); break;
+            case 16: r = %s(x, delta, 16); break;
+            case 32: r = %s(x, delta, 32); break;
+            case 64: r = %s(x, delta, 64); break;
+            case 128: r = %s(x, delta, 128); break;
+        }
+        out[gid] = r;
+    }
+// Runs the "sub_group_clustered_rotate" test for element type T by calling
+// rft.run_impl with SHF<T, ShuffleOp::clustered_rotate>, and returns
+// run_impl's result unchanged.
+template <typename T> int run_clustered_rotate_for_type(RunTestForType rft)
+    int error = rft.run_impl<T, SHF<T, ShuffleOp::clustered_rotate>>(
+        "sub_group_clustered_rotate");
+    return error;
+// Entry point for the cl_khr_subgroup_rotate tests.
+// Returns TEST_SKIPPED_ITSELF when the device does not report the extension;
+// otherwise returns the bitwise OR of every per-type run result, first for the
+// plain rotate and then for the clustered rotate.
+int test_subgroup_functions_rotate(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements)
+    if (!is_extension_available(device, "cl_khr_subgroup_rotate"))
+    {
+        log_info("cl_khr_subgroup_rotate is not supported on this device, "
+                 "skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    // Plain rotate: registers sub_group_generic_source, which is not defined
+    // in the visible part of this file, as the kernel source.
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_generic_source);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    int error = run_rotate_for_type<cl_int>(rft);
+    error |= run_rotate_for_type<cl_uint>(rft);
+    error |= run_rotate_for_type<cl_long>(rft);
+    error |= run_rotate_for_type<cl_ulong>(rft);
+    error |= run_rotate_for_type<cl_short>(rft);
+    error |= run_rotate_for_type<cl_ushort>(rft);
+    error |= run_rotate_for_type<cl_char>(rft);
+    error |= run_rotate_for_type<cl_uchar>(rft);
+    error |= run_rotate_for_type<cl_float>(rft);
+    error |= run_rotate_for_type<cl_double>(rft);
+    error |= run_rotate_for_type<subgroups::cl_half>(rft);
+    // Clustered rotate: same work sizes, but with this file's own kernel
+    // template registered as the source.
+    // NOTE(review): the meaning of the extra constructor arguments (-1, 3) is
+    // defined by WorkGroupParams, which is not visible here -- confirm.
+    WorkGroupParams test_params_clustered(global_work_size, local_work_size, -1,
+                                          3);
+    test_params_clustered.save_kernel_source(sub_group_clustered_rotate_source);
+    RunTestForType rft_clustered(device, context, queue, num_elements,
+                                 test_params_clustered);
+    error |= run_clustered_rotate_for_type<cl_int>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_uint>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_long>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_ulong>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_short>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_ushort>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_char>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_uchar>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_float>(rft_clustered);
+    error |= run_clustered_rotate_for_type<cl_double>(rft_clustered);
+    error |= run_clustered_rotate_for_type<subgroups::cl_half>(rft_clustered);
+    return error;
diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp
index 049f098..56231cb 100644
--- a/test_conformance/subgroups/test_subgroup_shuffle.cpp
+++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp
@@ -15,38 +15,19 @@
 #include "procs.h"
 #include "subhelpers.h"
+#include "subgroup_common_kernels.h"
 #include "subgroup_common_templates.h"
 #include "harness/typeWrappers.h"
 #include <bitset>
 namespace {
-static const char* shuffle_xor_source =
-    "__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
-    "}\n";
-static const char* shuffle_source =
-    "__kernel void test_sub_group_shuffle(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle(x, xy[gid].z);"
-    "}\n";
 template <typename T> int run_shuffle_for_type(RunTestForType rft)
-    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>(
-        "test_sub_group_shuffle", shuffle_source);
+    int error =
+        rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("sub_group_shuffle");
     error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
-        "test_sub_group_shuffle_xor", shuffle_xor_source);
+        "sub_group_shuffle_xor");
     return error;
@@ -55,11 +36,17 @@
 int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
                                     cl_command_queue queue, int num_elements)
-    std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" };
+    if (!is_extension_available(device, "cl_khr_subgroup_shuffle"))
+    {
+        log_info("cl_khr_subgroup_shuffle is not supported on this device, "
+                 "skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_generic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_shuffle_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
index 6000c97..caa1dcc 100644
--- a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
+++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
@@ -15,37 +15,19 @@
 #include "procs.h"
 #include "subhelpers.h"
+#include "subgroup_common_kernels.h"
 #include "subgroup_common_templates.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 namespace {
-static const char* shuffle_down_source =
-    "__kernel void test_sub_group_shuffle_down(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
-    "}\n";
-static const char* shuffle_up_source =
-    "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
-    "}\n";
 template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
-    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
-        "test_sub_group_shuffle_up", shuffle_up_source);
+    int error =
+        rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>("sub_group_shuffle_up");
     error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
-        "test_sub_group_shuffle_down", shuffle_down_source);
+        "sub_group_shuffle_down");
     return error;
@@ -56,13 +38,17 @@
                                              cl_command_queue queue,
                                              int num_elements)
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_shuffle_relative"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_shuffle_relative"))
+    {
+        log_info("cl_khr_subgroup_shuffle_relative is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_generic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_shuffle_relative_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_workitem.cpp b/test_conformance/subgroups/test_workitem.cpp
index 7ffa6a7..b69f313 100644
--- a/test_conformance/subgroups/test_workitem.cpp
+++ b/test_conformance/subgroups/test_workitem.cpp
@@ -16,6 +16,7 @@
 #include "procs.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
+#include <CL/cl.h>
 struct get_test_data
@@ -251,8 +252,21 @@
     global = local * 5;
-    // Make sure we have a flexible range
-    global += 3 * local / 4;
+    // Non-uniform work-groups are an optional feature from 3.0 onward.
+    cl_bool device_supports_non_uniform_wg = CL_TRUE;
+    if (get_device_cl_version(device) >= Version(3, 0))
+    {
+        error = clGetDeviceInfo(
+            device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+            &device_supports_non_uniform_wg, nullptr);
+        test_error(error, "clGetDeviceInfo failed");
+    }
+    if (device_supports_non_uniform_wg)
+    {
+        // Make sure we have a flexible range
+        global += 3 * local / 4;
+    }
     // Collect the data
     memset((void *)&result, 0xf0, sizeof(result));
@@ -327,4 +341,4 @@
     return test_work_item_functions(device, context, queue, num_elements,
\ No newline at end of file
diff --git a/test_conformance/submission_details_template.txt b/test_conformance/submission_details_template.txt
index 9d276a6..ff62483 100644
--- a/test_conformance/submission_details_template.txt
+++ b/test_conformance/submission_details_template.txt
@@ -81,6 +81,12 @@
 Tests version:
+# Commit SHAs (7-digit) of any cherry-picked patches subsequent to tagged
+# version. Any patches included must apply without conflicts to the tagged
+# version in the order listed.
 # Implementations that support cl_khr_icd are required to use a loader to run
 # the tests and document the loader that was used.
diff --git a/test_conformance/vectors/test_step.cpp b/test_conformance/vectors/test_step.cpp
index 2f6ad18..089bad2 100644
--- a/test_conformance/vectors/test_step.cpp
+++ b/test_conformance/vectors/test_step.cpp
@@ -172,6 +172,8 @@
                 return -1;
+            clStateDestroyProgramAndKernel(pClState);
diff --git a/test_conformance/vulkan/CMakeLists.txt b/test_conformance/vulkan/CMakeLists.txt
new file mode 100644
index 0000000..4f43172
--- /dev/null
+++ b/test_conformance/vulkan/CMakeLists.txt
@@ -0,0 +1,50 @@
+    list(APPEND CLConform_LIBRARIES vulkan-1)
+    list(APPEND CLConform_LIBRARIES vulkan dl)
+set(CMAKE_CXX_FLAGS "-fpermissive")
+include_directories (${CLConform_INCLUDE_DIR})
+        main.cpp
+        test_vulkan_interop_buffer.cpp
+        test_vulkan_interop_image.cpp
+        test_vulkan_api_consistency.cpp
+        test_vulkan_platform_device_info.cpp
+        vulkan_interop_common/vulkan_wrapper.cpp
+        vulkan_interop_common/vulkan_interop_common.cpp
+        vulkan_interop_common/opencl_vulkan_wrapper.cpp
+        vulkan_interop_common/vulkan_utility.cpp
+        vulkan_interop_common/vulkan_list_map.cpp
+        ../../test_common/harness/genericThread.cpp
+        ../../test_common/harness/errorHelpers.cpp
+        ../../test_common/harness/testHarness.cpp
+        ../../test_common/harness/kernelHelpers.cpp
+        ../../test_common/harness/mt19937.cpp
+        ../../test_common/harness/msvc9.c
+        ../../test_common/harness/parseParameters.cpp
+        ../../test_common/harness/deviceInfo.cpp
+        ../../test_common/harness/crc32.cpp
+    )
diff --git a/test_conformance/vulkan/main.cpp b/test_conformance/vulkan/main.cpp
new file mode 100644
index 0000000..2eeb0c3
--- /dev/null
+++ b/test_conformance/vulkan/main.cpp
@@ -0,0 +1,346 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <stdio.h>
+#include <stdlib.h>
+#if !defined(_WIN32)
+#include <stdbool.h>
+#include <math.h>
+#include <string.h>
+#if !defined(__APPLE__)
+#include <CL/cl.h>
+#include <OpenCL/cl.h>
+#include "procs.h"
+#include "harness/testHarness.h"
+#include "harness/parseParameters.h"
+#include "harness/deviceInfo.h"
+#if !defined(_WIN32)
+#include <unistd.h>
+#include <vulkan_interop_common.hpp>
+#include <vulkan_wrapper.hpp>
+#define BUFFERSIZE 3000
+// Restores the per-test configuration globals to their defaults:
+// one command queue, no multiple import, no multiple contexts.
+static void params_reset()
+    numCQ = 1;
+    multiImport = false;
+    multiCtx = false;
+extern int test_buffer_common(cl_device_id device_, cl_context context_,
+                              cl_command_queue queue_, int numElements_);
+extern int test_image_common(cl_device_id device_, cl_context context_,
+                             cl_command_queue queue_, int numElements_);
+// Runs test_buffer_common with the default configuration set by
+// params_reset() (numCQ = 1) and returns its result.
+int test_buffer_single_queue(cl_device_id device_, cl_context context_,
+                             cl_command_queue queue_, int numElements_)
+    params_reset();
+    log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_);
+// Runs test_buffer_common with numCQ set to 2 (all other configuration
+// globals at their defaults) and returns its result.
+int test_buffer_multiple_queue(cl_device_id device_, cl_context context_,
+                               cl_command_queue queue_, int numElements_)
+    params_reset();
+    numCQ = 2;
+    log_info("RUNNING TEST WITH TWO QUEUE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_);
+// Runs test_buffer_common with multiImport enabled and multiCtx left at its
+// default (false), and returns its result.
+int test_buffer_multiImport_sameCtx(cl_device_id device_, cl_context context_,
+                                    cl_command_queue queue_, int numElements_)
+    params_reset();
+    multiImport = true;
+             "IN SAME CONTEXT...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_);
+// Runs test_buffer_common with both multiImport and multiCtx enabled, and
+// returns its result.
+int test_buffer_multiImport_diffCtx(cl_device_id device_, cl_context context_,
+                                    cl_command_queue queue_, int numElements_)
+    params_reset();
+    multiImport = true;
+    multiCtx = true;
+             "IN DIFFERENT CONTEXT...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_);
+// Runs test_image_common with the default configuration set by
+// params_reset() (numCQ = 1) and returns its result.
+int test_image_single_queue(cl_device_id device_, cl_context context_,
+                            cl_command_queue queue_, int numElements_)
+    params_reset();
+    log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
+    return test_image_common(device_, context_, queue_, numElements_);
+// Runs test_image_common with numCQ set to 2 (all other configuration
+// globals at their defaults) and returns its result.
+int test_image_multiple_queue(cl_device_id device_, cl_context context_,
+                              cl_command_queue queue_, int numElements_)
+    params_reset();
+    numCQ = 2;
+    log_info("RUNNING TEST WITH TWO QUEUE...... \n\n");
+    return test_image_common(device_, context_, queue_, numElements_);
+// Table of the tests this executable offers. printUsage() lists the names and
+// main() hands the table to parseAndCallCommandLineTests. The first six
+// entries are the wrappers defined above; the consistency_*, platform_info
+// and device_info tests are only declared (in procs.h), not defined here.
+test_definition test_list[] = { ADD_TEST(buffer_single_queue),
+                                ADD_TEST(buffer_multiple_queue),
+                                ADD_TEST(buffer_multiImport_sameCtx),
+                                ADD_TEST(buffer_multiImport_diffCtx),
+                                ADD_TEST(image_single_queue),
+                                ADD_TEST(image_multiple_queue),
+                                ADD_TEST(consistency_external_buffer),
+                                ADD_TEST(consistency_external_image),
+                                ADD_TEST(consistency_external_semaphore),
+                                ADD_TEST(platform_info),
+                                ADD_TEST(device_info) };
+const int test_num = ARRAY_SIZE(test_list);
+cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
+char *choosen_platform_name = NULL;
+cl_platform_id platform = NULL;
+cl_int choosen_platform_index = -1;
+char platform_name[1024] = "";
+cl_platform_id select_platform = NULL;
+char *extensions = NULL;
+size_t extensionSize = 0;
+cl_uint num_devices = 0;
+cl_uint device_no = 0;
+cl_device_id *devices;
+const size_t bufsize = BUFFERSIZE;
+char buf[BUFFERSIZE];
+cl_uchar uuid[CL_UUID_SIZE_KHR];
+unsigned int numCQ;
+bool multiImport;
+bool multiCtx;
+bool debug_trace = false;
+bool useSingleImageKernel = false;
+bool useDeviceLocal = false;
+bool disableNTHandleType = false;
+bool enableOffset = false;
+bool non_dedicated = false;
+// Prints the usage text through log_info: the executable's base name (the
+// text after the last '/' in execName, if any), every name in test_list, and
+// the option list.
+// Only --debug_trace and --non_dedicated are listed here, although
+// parseParams() accepts further flags.
+static void printUsage(const char *execName)
+    const char *p = strrchr(execName, '/');
+    if (p != NULL) execName = p + 1;
+    log_info("Usage: %s [test_names] [options]\n", execName);
+    log_info("Test names:\n");
+    for (int i = 0; i < test_num; i++)
+    {
+        log_info("\t%s\n", test_list[i].name);
+    }
+    log_info("\n");
+    log_info("Options:\n");
+    log_info("\t--debug_trace - Enables additional debug info logging\n");
+    log_info("\t--non_dedicated - Choose dedicated Vs. non_dedicated \n");
+// Splits the command line into option flags and positional arguments.
+// Arguments starting with '-' set the matching global flag; a '-' argument
+// that matches none of the known flags is dropped without a message.
+// Every other argument is copied into argList, starting at index 1
+// (argList[0] is not written here).
+// Returns the next free argList index (1 + number of positional arguments),
+// or 0 when "-h" was given, in which case the usage text has been printed.
+size_t parseParams(int argc, const char *argv[], const char **argList)
+    size_t argCount = 1;
+    for (int i = 1; i < argc; i++)
+    {
+        if (argv[i] == NULL) break;
+        if (argv[i][0] == '-')
+        {
+            if (!strcmp(argv[i], "--debug_trace"))
+            {
+                debug_trace = true;
+            }
+            if (!strcmp(argv[i], "--useSingleImageKernel"))
+            {
+                useSingleImageKernel = true;
+            }
+            if (!strcmp(argv[i], "--useDeviceLocal"))
+            {
+                useDeviceLocal = true;
+            }
+            if (!strcmp(argv[i], "--disableNTHandleType"))
+            {
+                disableNTHandleType = true;
+            }
+            if (!strcmp(argv[i], "--enableOffset"))
+            {
+                enableOffset = true;
+            }
+            if (!strcmp(argv[i], "--non_dedicated"))
+            {
+                non_dedicated = true;
+            }
+            if (strcmp(argv[i], "-h") == 0)
+            {
+                printUsage(argv[0]);
+                argCount = 0; // Returning argCount=0 to assert error in main()
+                break;
+            }
+        }
+        else
+        {
+            argList[argCount] = argv[i];
+            argCount++;
+        }
+    }
+    return argCount;
+// Program entry point for the Vulkan/OpenCL interop tests.
+// Returns 0 (skip) when checkVkSupport() fails, when CL_DEVICE_TYPE requests a
+// non-GPU device, when "-h" was given, or when the selected device lacks
+// cl_khr_external_memory / cl_khr_external_semaphore. Returns an OpenCL error
+// code when platform/device discovery fails or when no OpenCL GPU has the
+// same UUID as the Vulkan physical device. Otherwise returns the result of
+// parseAndCallCommandLineTests for the matching device.
+int main(int argc, const char *argv[])
+    int errNum = 0;
+    test_start();
+    params_reset();
+    if (!checkVkSupport())
+    {
+        log_info("Vulkan supported GPU not found \n");
+        log_info("TEST SKIPPED \n");
+        return 0;
+    }
+    VulkanDevice vkDevice;
+    // The CL_DEVICE_TYPE environment variable may override the device type;
+    // anything other than a GPU is rejected below.
+    cl_device_type requestedDeviceType = CL_DEVICE_TYPE_GPU;
+    char *force_cpu = getenv("CL_DEVICE_TYPE");
+    if (force_cpu != NULL)
+    {
+        if (strcmp(force_cpu, "gpu") == 0
+            || strcmp(force_cpu, "CL_DEVICE_TYPE_GPU") == 0)
+            requestedDeviceType = CL_DEVICE_TYPE_GPU;
+        else if (strcmp(force_cpu, "cpu") == 0
+                 || strcmp(force_cpu, "CL_DEVICE_TYPE_CPU") == 0)
+            requestedDeviceType = CL_DEVICE_TYPE_CPU;
+        else if (strcmp(force_cpu, "accelerator") == 0
+                 || strcmp(force_cpu, "CL_DEVICE_TYPE_ACCELERATOR") == 0)
+            requestedDeviceType = CL_DEVICE_TYPE_ACCELERATOR;
+        else if (strcmp(force_cpu, "CL_DEVICE_TYPE_DEFAULT") == 0)
+            requestedDeviceType = CL_DEVICE_TYPE_DEFAULT;
+    }
+    if (requestedDeviceType != CL_DEVICE_TYPE_GPU)
+    {
+        log_info("Vulkan tests can only run on a GPU device.\n");
+        return 0;
+    }
+    gDeviceType = CL_DEVICE_TYPE_GPU;
+    const char **argList = (const char **)calloc(argc, sizeof(char *));
+    size_t argCount = parseParams(argc, argv, argList);
+    if (argCount == 0) return 0;
+    // get the platform ID
+    // Only the first platform reported by the implementation is considered.
+    errNum = clGetPlatformIDs(1, &platform, NULL);
+    if (errNum != CL_SUCCESS)
+    {
+        print_error(errNum, "Error: Failed to get platform\n");
+        return errNum;
+    }
+    errNum =
+        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "clGetDeviceIDs failed in returning of devices\n");
+        return errNum;
+    }
+    devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+    if (NULL == devices)
+    {
+        print_error(errNum, "Unable to allocate memory for devices\n");
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+    errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices,
+                            NULL);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "Failed to get deviceID.\n");
+        return errNum;
+    }
+    // Look for the OpenCL device whose UUID equals the Vulkan device's UUID.
+    // On a match the loop breaks with device_no indexing that device and
+    // `extensions` holding its extension string.
+    for (device_no = 0; device_no < num_devices; device_no++)
+    {
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0,
+                                 NULL, &extensionSize);
+        if (CL_SUCCESS != errNum)
+        {
+            log_error("Error in clGetDeviceInfo for getting "
+                      "device_extension size....\n");
+            return errNum;
+        }
+        extensions = (char *)malloc(extensionSize);
+        if (NULL == extensions)
+        {
+            log_error("Unable to allocate memory for extensions\n");
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+        errNum =
+            clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS,
+                            extensionSize, extensions, NULL /*&extensionSize*/);
+        if (CL_SUCCESS != errNum)
+        {
+            print_error(errNum,
+                        "Error in clGetDeviceInfo for getting "
+                        "device_extension\n");
+            return errNum;
+        }
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR,
+                                 CL_UUID_SIZE_KHR, uuid, &extensionSize);
+        if (CL_SUCCESS != errNum)
+        {
+            print_error(errNum, "clGetDeviceInfo failed with error\n ");
+            return errNum;
+        }
+        errNum =
+            memcmp(uuid, vkDevice.getPhysicalDevice().getUUID(), VK_UUID_SIZE);
+        if (errNum == 0)
+        {
+            break;
+        }
+        // Not the Vulkan device: release this device's extension string so the
+        // next iteration's malloc does not leak it.
+        free(extensions);
+        extensions = NULL;
+    }
+    if (device_no >= num_devices)
+    {
+        fprintf(stderr,
+                "OpenCL error: "
+                "No Vulkan-OpenCL Interop capable GPU found.\n");
+        // device_no == num_devices here, so devices[device_no] would read past
+        // the end of the array; fail instead of falling through.
+        return CL_DEVICE_NOT_FOUND;
+    }
+    if (!(is_extension_available(devices[device_no], "cl_khr_external_memory")
+          && is_extension_available(devices[device_no],
+                                    "cl_khr_external_semaphore")))
+    {
+        log_info("Device does not support cl_khr_external_memory "
+                 "or cl_khr_external_semaphore\n");
+        log_info(" TEST SKIPPED\n");
+        return CL_SUCCESS;
+    }
+    init_cl_vk_ext(platform);
+    // Execute tests.
+    // Note: don't use the entire harness, because we have a different way of
+    // obtaining the device (via the context)
+    errNum = parseAndCallCommandLineTests(argCount, argList, devices[device_no],
+                                          test_num, test_list, true, 0, 1024);
+    return errNum;
diff --git a/test_conformance/vulkan/procs.h b/test_conformance/vulkan/procs.h
new file mode 100644
index 0000000..37bf786
--- /dev/null
+++ b/test_conformance/vulkan/procs.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "harness/mt19937.h"
+extern int test_vulkan_interop_buffer(cl_device_id device, cl_context context,
+                                      cl_command_queue queue, int num_elements);
+extern int test_vulkan_interop_image(cl_device_id device, cl_context context,
+                                     cl_command_queue queue, int num_elements);
+extern int test_consistency_external_buffer(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements);
+extern int test_consistency_external_image(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements);
+extern int test_consistency_external_semaphore(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+extern int test_platform_info(cl_device_id device, cl_context context,
+                              cl_command_queue queue, int num_elements);
+extern int test_device_info(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements);
diff --git a/test_conformance/vulkan/shaders/buffer.comp b/test_conformance/vulkan/shaders/buffer.comp
new file mode 100644
index 0000000..d8756f9
--- /dev/null
+++ b/test_conformance/vulkan/shaders/buffer.comp
@@ -0,0 +1,28 @@
+#version 450
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8    : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int32   : enable
+#define MAX_BUFFERS 5
+layout(binding = 0) buffer Params
+  uint32_t numBuffers;
+  uint32_t bufferSize;
+  uint32_t interBufferOffset;
+layout(binding = 1) buffer Buffer
+  uint8_t ptr[];
+} bufferPtrList[MAX_BUFFERS];
+layout(local_size_x = 512) in;
+// Increments every byte of ptr[] in each of the first numBuffers entries of
+// bufferPtrList. Each invocation starts at its global x index and advances by
+// the total number of invocations in x until it reaches bufferSize.
+void main() {
+    for (uint32_t bufIdx = 0; bufIdx < numBuffers; bufIdx++) {
+        uint32_t ptrIdx = gl_GlobalInvocationID.x;
+        uint32_t limit = bufferSize;
+        while (ptrIdx < limit) {
+            bufferPtrList[bufIdx].ptr[ptrIdx]++;
+            ptrIdx += (gl_NumWorkGroups.x * gl_WorkGroupSize.x);
+        }
+    }
\ No newline at end of file
diff --git a/test_conformance/vulkan/shaders/buffer.spv b/test_conformance/vulkan/shaders/buffer.spv
new file mode 100644
index 0000000..685523b
--- /dev/null
+++ b/test_conformance/vulkan/shaders/buffer.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D.comp b/test_conformance/vulkan/shaders/image2D.comp
new file mode 100644
index 0000000..42fa2f7
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D.comp
@@ -0,0 +1,31 @@
+#version 450
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int32   : enable
+#define MAX_2D_IMAGES               5
+#define MAX_2D_IMAGE_MIP_LEVELS     11
+layout(binding = 0) buffer Params
+    uint32_t numImage2DDescriptors;
+layout(binding = 1, rgba32f ) uniform image2D image2DList[ MAX_2D_IMAGE_DESCRIPTORS ];
+layout(local_size_x = 32, local_size_y = 32) in;
+void main() {
+    uvec3 numThreads = gl_NumWorkGroups * gl_WorkGroupSize;
+    for (uint32_t image2DIdx = 0; image2DIdx < numImage2DDescriptors; image2DIdx++) {
+        ivec2 imageDim = imageSize(image2DList[image2DIdx]);
+        uint32_t heightBy2 = imageDim.y / 2;
+        for (uint32_t row = gl_GlobalInvocationID.y; row < heightBy2; row += numThreads.y) {
+            for (uint32_t col = gl_GlobalInvocationID.x; col < imageDim.x; col += numThreads.x) {
+                ivec2 coordsA = ivec2(col, row);
+                ivec2 coordsB = ivec2(col, imageDim.y - row - 1);
+                vec4 dataA = imageLoad(image2DList[image2DIdx], coordsA);
+                vec4 dataB = imageLoad(image2DList[image2DIdx], coordsB);
+                imageStore(image2DList[image2DIdx], coordsA, dataB);
+                imageStore(image2DList[image2DIdx], coordsB, dataA);
+            }
+        }
+    }
\ No newline at end of file
diff --git a/test_conformance/vulkan/shaders/image2D_r16i.spv b/test_conformance/vulkan/shaders/image2D_r16i.spv
new file mode 100644
index 0000000..00c5c28
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r16i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_r16ui.spv b/test_conformance/vulkan/shaders/image2D_r16ui.spv
new file mode 100644
index 0000000..87514d9
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r16ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_r32f.spv b/test_conformance/vulkan/shaders/image2D_r32f.spv
new file mode 100644
index 0000000..e82c9c1
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r32f.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_r32i.spv b/test_conformance/vulkan/shaders/image2D_r32i.spv
new file mode 100644
index 0000000..7ea8d26
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r32i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_r32ui.spv b/test_conformance/vulkan/shaders/image2D_r32ui.spv
new file mode 100644
index 0000000..dbcdbc5
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r32ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_r8i.spv b/test_conformance/vulkan/shaders/image2D_r8i.spv
new file mode 100644
index 0000000..1a64147
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r8i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_r8ui.spv b/test_conformance/vulkan/shaders/image2D_r8ui.spv
new file mode 100644
index 0000000..a90ccf9
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_r8ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg16i.spv b/test_conformance/vulkan/shaders/image2D_rg16i.spv
new file mode 100644
index 0000000..0799617
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg16i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg16ui.spv b/test_conformance/vulkan/shaders/image2D_rg16ui.spv
new file mode 100644
index 0000000..f73e096
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg16ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg32f.spv b/test_conformance/vulkan/shaders/image2D_rg32f.spv
new file mode 100644
index 0000000..1489660
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg32f.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg32i.spv b/test_conformance/vulkan/shaders/image2D_rg32i.spv
new file mode 100644
index 0000000..b7d302f
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg32i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg32ui.spv b/test_conformance/vulkan/shaders/image2D_rg32ui.spv
new file mode 100644
index 0000000..6cf2f1b
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg32ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg8i.spv b/test_conformance/vulkan/shaders/image2D_rg8i.spv
new file mode 100644
index 0000000..a71b9bf
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg8i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rg8ui.spv b/test_conformance/vulkan/shaders/image2D_rg8ui.spv
new file mode 100644
index 0000000..2aca929
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rg8ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba16i.spv b/test_conformance/vulkan/shaders/image2D_rgba16i.spv
new file mode 100644
index 0000000..0cb95df
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba16i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba16ui.spv b/test_conformance/vulkan/shaders/image2D_rgba16ui.spv
new file mode 100644
index 0000000..84c3d3d
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba16ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba32f.spv b/test_conformance/vulkan/shaders/image2D_rgba32f.spv
new file mode 100644
index 0000000..35136c5
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba32f.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba32i.spv b/test_conformance/vulkan/shaders/image2D_rgba32i.spv
new file mode 100644
index 0000000..4d1ae58
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba32i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba32ui.spv b/test_conformance/vulkan/shaders/image2D_rgba32ui.spv
new file mode 100644
index 0000000..bed86f0
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba32ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba8i.spv b/test_conformance/vulkan/shaders/image2D_rgba8i.spv
new file mode 100644
index 0000000..edf8c58
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba8i.spv
Binary files differ
diff --git a/test_conformance/vulkan/shaders/image2D_rgba8ui.spv b/test_conformance/vulkan/shaders/image2D_rgba8ui.spv
new file mode 100644
index 0000000..bb9a770
--- /dev/null
+++ b/test_conformance/vulkan/shaders/image2D_rgba8ui.spv
Binary files differ
diff --git a/test_conformance/vulkan/test_vulkan_api_consistency.cpp b/test_conformance/vulkan/test_vulkan_api_consistency.cpp
new file mode 100644
index 0000000..f22ac31
--- /dev/null
+++ b/test_conformance/vulkan/test_vulkan_api_consistency.cpp
@@ -0,0 +1,568 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <vulkan_interop_common.hpp>
+#include <opencl_vulkan_wrapper.hpp>
+#include <vulkan_wrapper.hpp>
+#if !defined(__APPLE__)
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_ext.h>
+#include <assert.h>
+#include <vector>
+#include <iostream>
+#include <string.h>
+#include "harness/testHarness.h"
+#include "harness/typeWrappers.h"
+#include "harness/deviceInfo.h"
+int test_consistency_external_buffer(cl_device_id deviceID, cl_context _context,
+                                     cl_command_queue _queue, int num_elements)
+    cl_int errNum;
+    VulkanDevice vkDevice;
+    // Context and command queue creation
+    cl_platform_id platform = NULL;
+    cl_context context = NULL;
+    cl_command_queue cmd_queue = NULL;
+    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    errNum = clGetPlatformIDs(1, &platform, NULL);
+    test_error(errNum, "Failed to get platform Id");
+    contextProperties[1] = (cl_context_properties)platform;
+    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
+                                      NULL, NULL, &errNum);
+    test_error(errNum, "Unable to create context with properties");
+    cmd_queue = clCreateCommandQueue(context, deviceID, 0, &errNum);
+    test_error(errNum, "Unable to create command queue");
+    uint32_t bufferSize = 32;
+    cl_device_id devList[] = { deviceID, NULL };
+#ifdef _WIN32
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_win32"))
+    {
+        throw std::runtime_error("Device does not support "
+                                 "cl_khr_external_memory_win32 extension \n");
+    }
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd"))
+    {
+        throw std::runtime_error(
+            "Device does not support "
+            "cl_khr_external_memory_opaque_fd extension \n");
+    }
+    VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
+        getSupportedVulkanExternalMemoryHandleTypeList()[0];
+    VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024, vkExternalMemoryHandleType);
+    const VulkanMemoryTypeList& memoryTypeList =
+        vkDummyBuffer.getMemoryTypeList();
+    VulkanDeviceMemory* vkDeviceMem = new VulkanDeviceMemory(
+        vkDevice, bufferSize, memoryTypeList[0], vkExternalMemoryHandleType);
+    VulkanBufferList vkBufferList(1, vkDevice, bufferSize,
+                                  vkExternalMemoryHandleType);
+    vkDeviceMem->bindBuffer(vkBufferList[0], 0);
+    void* handle = NULL;
+    int fd;
+    std::vector<cl_mem_properties> extMemProperties{
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR,
+        (cl_mem_properties)devList[0],
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR,
+    };
+    cl_external_memory_handle_type_khr type;
+    switch (vkExternalMemoryHandleType)
+    {
+#ifdef _WIN32
+            handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType);
+            errNum = check_external_memory_handle_type(devList[0], type);
+            extMemProperties.push_back((cl_mem_properties)type);
+            extMemProperties.push_back((cl_mem_properties)handle);
+            break;
+            handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType);
+            errNum = check_external_memory_handle_type(devList[0], type);
+            extMemProperties.push_back((cl_mem_properties)type);
+            extMemProperties.push_back((cl_mem_properties)handle);
+            break;
+            fd = (int)vkDeviceMem->getHandle(vkExternalMemoryHandleType);
+            errNum = check_external_memory_handle_type(devList[0], type);
+            extMemProperties.push_back((cl_mem_properties)type);
+            extMemProperties.push_back((cl_mem_properties)fd);
+            break;
+        default:
+            errNum = TEST_FAIL;
+            log_error("Unsupported external memory handle type \n");
+            break;
+    }
+    if (errNum != CL_SUCCESS)
+    {
+        log_error("Checks failed for "
+        return TEST_FAIL;
+    }
+    extMemProperties.push_back(0);
+    clMemWrapper buffer;
+    // Passing NULL properties and a valid extMem_desc size
+    buffer = clCreateBufferWithProperties(context, NULL, 1, bufferSize, NULL,
+                                          &errNum);
+    test_error(errNum, "Unable to create buffer with NULL properties");
+    buffer.reset();
+    // Passing valid extMemProperties and buffersize
+    buffer = clCreateBufferWithProperties(context,, 1,
+                                          bufferSize, NULL, &errNum);
+    test_error(errNum, "Unable to create buffer with Properties");
+    buffer.reset();
+    // Not passing external memory handle
+    std::vector<cl_mem_properties> extMemProperties2{
+#ifdef _WIN32
+        (cl_mem_properties)type,
+        NULL, // Passing NULL handle
+        (cl_mem_properties)type,
+        (cl_mem_properties)-64, // Passing random invalid fd
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR,
+        (cl_mem_properties)devList[0],
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR,
+        0
+    };
+    buffer = clCreateBufferWithProperties(context,, 1,
+                                          bufferSize, NULL, &errNum);
+    test_failure_error(errNum, CL_INVALID_VALUE,
+                       "Should return CL_INVALID_VALUE ");
+    buffer.reset();
+    // Passing extMem_desc size = 0 but valid memProperties, CL_INVALID_SIZE
+    // should be returned.
+    buffer = clCreateBufferWithProperties(context,, 1,
+                                          0, NULL, &errNum);
+    test_failure_error(errNum, CL_INVALID_BUFFER_SIZE,
+                       "Should return CL_INVALID_BUFFER_SIZE");
+    return TEST_PASS;
+int test_consistency_external_image(cl_device_id deviceID, cl_context _context,
+                                    cl_command_queue _queue, int num_elements)
+    cl_int errNum;
+    VulkanDevice vkDevice;
+    // Context and command queue creation
+    cl_platform_id platform = NULL;
+    cl_context context = NULL;
+    cl_command_queue cmd_queue = NULL;
+    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    errNum = clGetPlatformIDs(1, &platform, NULL);
+    test_error(errNum, "Failed to get platform id");
+    contextProperties[1] = (cl_context_properties)platform;
+    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
+                                      NULL, NULL, &errNum);
+    test_error(errNum, "Unable to create context with properties");
+    cmd_queue = clCreateCommandQueue(context, deviceID, 0, &errNum);
+    test_error(errNum, "Unable to create command queue");
+    cl_device_id devList[] = { deviceID, NULL };
+#ifdef _WIN32
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_win32"))
+    {
+        throw std::runtime_error("Device does not support"
+                                 "cl_khr_external_memory_win32 extension \n");
+    }
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd"))
+    {
+        throw std::runtime_error(
+            "Device does not support cl_khr_external_memory_opaque_fd "
+            "extension \n");
+    }
+    uint32_t width = 256;
+    uint32_t height = 16;
+    cl_image_desc image_desc;
+    memset(&image_desc, 0x0, sizeof(cl_image_desc));
+    cl_image_format img_format = { 0 };
+    VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
+        getSupportedVulkanExternalMemoryHandleTypeList()[0];
+    VulkanImage2D* vkImage2D =
+        new VulkanImage2D(vkDevice, VULKAN_FORMAT_R8G8B8A8_UNORM, width, height,
+                          1, vkExternalMemoryHandleType);
+    const VulkanMemoryTypeList& memoryTypeList = vkImage2D->getMemoryTypeList();
+    uint64_t totalImageMemSize = vkImage2D->getSize();
+    log_info("Memory type index: %lu\n", (uint32_t)memoryTypeList[0]);
+    log_info("Memory type property: %d\n",
+             memoryTypeList[0].getMemoryTypeProperty());
+    log_info("Image size : %d\n", totalImageMemSize);
+    VulkanDeviceMemory* vkDeviceMem =
+        new VulkanDeviceMemory(vkDevice, totalImageMemSize, memoryTypeList[0],
+                               vkExternalMemoryHandleType);
+    vkDeviceMem->bindImage(*vkImage2D, 0);
+    void* handle = NULL;
+    int fd;
+    std::vector<cl_mem_properties> extMemProperties{
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR,
+        (cl_mem_properties)devList[0],
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR,
+    };
+    switch (vkExternalMemoryHandleType)
+    {
+#ifdef _WIN32
+            handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType);
+            errNum = check_external_memory_handle_type(
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR);
+            extMemProperties.push_back(
+                (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR);
+            extMemProperties.push_back((cl_mem_properties)handle);
+            break;
+            handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType);
+            errNum = check_external_memory_handle_type(
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            extMemProperties.push_back(
+                (cl_mem_properties)
+            extMemProperties.push_back((cl_mem_properties)handle);
+            break;
+            fd = (int)vkDeviceMem->getHandle(vkExternalMemoryHandleType);
+            errNum = check_external_memory_handle_type(
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
+            extMemProperties.push_back(
+                (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
+            extMemProperties.push_back((cl_mem_properties)fd);
+            break;
+        default:
+            errNum = TEST_FAIL;
+            log_error("Unsupported external memory handle type \n");
+            break;
+    }
+    if (errNum != CL_SUCCESS)
+    {
+        log_error("Checks failed for "
+        return TEST_FAIL;
+    }
+    extMemProperties.push_back(0);
+    const VkImageCreateInfo VulkanImageCreateInfo =
+        vkImage2D->getVkImageCreateInfo();
+    errNum = getCLImageInfoFromVkImageInfo(
+        &VulkanImageCreateInfo, totalImageMemSize, &img_format, &image_desc);
+    if (errNum != CL_SUCCESS)
+    {
+        log_error("getCLImageInfoFromVkImageInfo failed!!!");
+        return TEST_FAIL;
+    }
+    clMemWrapper image;
+    // Pass valid properties, image_desc and image_format
+    image = clCreateImageWithProperties(
+        context,, CL_MEM_READ_WRITE, &img_format,
+        &image_desc, NULL /* host_ptr */, &errNum);
+    test_error(errNum, "Unable to create Image with Properties");
+    image.reset();
+    // Passing properties, image_desc and image_format all as NULL
+    image = clCreateImageWithProperties(context, NULL, CL_MEM_READ_WRITE, NULL,
+                                        NULL, NULL, &errNum);
+    test_failure_error(
+        "Image creation must fail with CL_INVALID_IMAGE_DESCRIPTOR "
+        "when all are passed as NULL");
+    image.reset();
+    // Passing NULL properties and a valid image_format and image_desc
+    image =
+        clCreateImageWithProperties(context, NULL, CL_MEM_READ_WRITE,
+                                    &img_format, &image_desc, NULL, &errNum);
+    test_error(errNum,
+               "Unable to create image with NULL properties "
+               "with valid image format and image desc");
+    image.reset();
+    // Passing image_format as NULL
+    image = clCreateImageWithProperties(context,,
+                                        CL_MEM_READ_WRITE, NULL, &image_desc,
+                                        NULL, &errNum);
+    test_failure_error(errNum, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+                       "Image creation must fail with "
+                       "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"
+                       "when image desc passed as NULL");
+    image.reset();
+    // Passing image_desc as NULL
+    image = clCreateImageWithProperties(context,,
+                                        CL_MEM_READ_WRITE, &img_format, NULL,
+                                        NULL, &errNum);
+    test_failure_error(errNum, CL_INVALID_IMAGE_DESCRIPTOR,
+                       "Image creation must fail with "
+                       "CL_INVALID_IMAGE_DESCRIPTOR "
+                       "when image desc passed as NULL");
+    image.reset();
+    return TEST_PASS;
+int test_consistency_external_semaphore(cl_device_id deviceID,
+                                        cl_context _context,
+                                        cl_command_queue _queue,
+                                        int num_elements)
+    cl_int errNum;
+    VulkanDevice vkDevice;
+    // Context and command queue creation
+    cl_platform_id platform = NULL;
+    cl_context context = NULL;
+    cl_command_queue cmd_queue = NULL;
+    errNum = clGetPlatformIDs(1, &platform, NULL);
+    test_error(errNum, "Failed to get platform Id");
+    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    contextProperties[1] = (cl_context_properties)platform;
+    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
+                                      NULL, NULL, &errNum);
+    test_error(errNum, "Unable to create context with properties");
+    cmd_queue = clCreateCommandQueue(context, deviceID, 0, &errNum);
+    test_error(errNum, "Unable to create command queue");
+    cl_device_id devList[] = { deviceID, NULL };
+#ifdef _WIN32
+    if (!is_extension_available(devList[0], "cl_khr_external_semaphore_win32"))
+    {
+        throw std::runtime_error(
+            "Device does not support cl_khr_external_semaphore_win32 "
+            "extension \n");
+    }
+    if (!is_extension_available(devList[0],
+                                "cl_khr_external_semaphore_opaque_fd"))
+    {
+        throw std::runtime_error(
+            "Device does not support "
+            "cl_khr_external_semaphore_opaque_fd extension \n");
+    }
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2Clsemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2Vksemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    cl_semaphore_khr clCl2Vksemaphore;
+    cl_semaphore_khr clVk2Clsemaphore;
+    void* handle1 = NULL;
+    void* handle2 = NULL;
+    int fd1, fd2;
+    std::vector<cl_semaphore_properties_khr> sema_props1{
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR,
+    };
+    std::vector<cl_semaphore_properties_khr> sema_props2{
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR,
+    };
+    switch (vkExternalSemaphoreHandleType)
+    {
+#ifdef _WIN32
+            log_info(" Opaque NT handles are only supported on Windows\n");
+            handle1 = vkVk2Clsemaphore.getHandle(vkExternalSemaphoreHandleType);
+            handle2 = vkCl2Vksemaphore.getHandle(vkExternalSemaphoreHandleType);
+            errNum = check_external_semaphore_handle_type(
+                devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR);
+            sema_props1.push_back((cl_semaphore_properties_khr)
+                                      CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR);
+            sema_props1.push_back((cl_semaphore_properties_khr)handle1);
+            sema_props2.push_back((cl_semaphore_properties_khr)
+                                      CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR);
+            sema_props2.push_back((cl_semaphore_properties_khr)handle2);
+            break;
+            log_info(" Opaque D3DKMT handles are only supported on Windows\n");
+            handle1 = vkVk2Clsemaphore.getHandle(vkExternalSemaphoreHandleType);
+            handle2 = vkCl2Vksemaphore.getHandle(vkExternalSemaphoreHandleType);
+            errNum = check_external_semaphore_handle_type(
+                devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            sema_props1.push_back((cl_semaphore_properties_khr)
+                                      CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            sema_props1.push_back((cl_semaphore_properties_khr)handle1);
+            sema_props2.push_back((cl_semaphore_properties_khr)
+                                      CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            sema_props2.push_back((cl_semaphore_properties_khr)handle2);
+            break;
+            log_info(" Opaque file descriptors are not supported on Windows\n");
+            fd1 =
+                (int)vkVk2Clsemaphore.getHandle(vkExternalSemaphoreHandleType);
+            fd2 =
+                (int)vkCl2Vksemaphore.getHandle(vkExternalSemaphoreHandleType);
+            errNum = check_external_semaphore_handle_type(
+                devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
+            sema_props1.push_back(
+                (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
+            sema_props1.push_back((cl_semaphore_properties_khr)fd1);
+            sema_props2.push_back(
+                (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
+            sema_props2.push_back((cl_semaphore_properties_khr)fd2);
+            break;
+        default: log_error("Unsupported external memory handle type\n"); break;
+    }
+    if (CL_SUCCESS != errNum)
+    {
+        throw std::runtime_error(
+            "Unsupported external sempahore handle type\n ");
+    }
+    sema_props1.push_back(
+        (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_KHR);
+    sema_props1.push_back((cl_semaphore_properties_khr)devList[0]);
+    sema_props1.push_back(
+        (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_END_KHR);
+    sema_props2.push_back(
+        (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_KHR);
+    sema_props2.push_back((cl_semaphore_properties_khr)devList[0]);
+    sema_props2.push_back(
+        (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_END_KHR);
+    sema_props1.push_back(0);
+    sema_props2.push_back(0);
+    // Pass NULL properties
+    cl_semaphore_khr cl_ext_semaphore =
+        clCreateSemaphoreWithPropertiesKHRptr(context, NULL, &errNum);
+    test_failure_error(errNum, CL_INVALID_VALUE,
+                       "Semaphore creation must fail with CL_INVALID_VALUE "
+                       " when properties are passed as NULL");
+    // Pass invalid semaphore object to wait
+    errNum =
+        clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, NULL, NULL, 0, NULL, NULL);
+    test_failure_error(errNum, CL_INVALID_VALUE,
+                       "clEnqueueWaitSemaphoresKHR fails with CL_INVALID_VALUE "
+                       "when invalid semaphore object is passed");
+    // Pass invalid semaphore object to signal
+    errNum = clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, NULL, NULL, 0, NULL,
+                                             NULL);
+    test_failure_error(
+        errNum, CL_INVALID_VALUE,
+        "clEnqueueSignalSemaphoresKHR fails with CL_INVALID_VALUE"
+        "when invalid semaphore object is passed");
+    // Create two semaphore objects
+    clVk2Clsemaphore = clCreateSemaphoreWithPropertiesKHRptr(
+        context,, &errNum);
+    test_error(errNum,
+               "Unable to create semaphore with valid semaphore properties");
+    clCl2Vksemaphore = clCreateSemaphoreWithPropertiesKHRptr(
+        context,, &errNum);
+    test_error(errNum,
+               "Unable to create semaphore with valid semaphore properties");
+    // Call Signal twice consecutively
+    errNum = clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, &clVk2Clsemaphore,
+                                             NULL, 0, NULL, NULL);
+    test_error(errNum, "clEnqueueSignalSemaphoresKHRptr failed");
+    errNum = clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, &clCl2Vksemaphore,
+                                             NULL, 0, NULL, NULL);
+    test_error(errNum,
+               "clEnqueueSignalSemaphoresKHRptr failed for two "
+               "consecutive wait events");
+    // Call Wait twice consecutively
+    errNum = clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &clVk2Clsemaphore,
+                                           NULL, 0, NULL, NULL);
+    test_error(errNum, "clEnqueueWaitSemaphoresKHRptr failed");
+    errNum = clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &clCl2Vksemaphore,
+                                           NULL, 0, NULL, NULL);
+    test_error(errNum,
+               "clEnqueueWaitSemaphoresKHRptr failed for two "
+               " consecutive wait events");
+    // Pass invalid object to release call
+    errNum = clReleaseSemaphoreKHRptr(NULL);
+    test_failure_error(errNum, CL_INVALID_VALUE,
+                       "clReleaseSemaphoreKHRptr fails with "
+                       "CL_INVALID_VALUE when NULL semaphore object is passed");
+    // Release both semaphore objects
+    errNum = clReleaseSemaphoreKHRptr(clVk2Clsemaphore);
+    test_error(errNum, "clReleaseSemaphoreKHRptr failed");
+    errNum = clReleaseSemaphoreKHRptr(clCl2Vksemaphore);
+    test_error(errNum, "clReleaseSemaphoreKHRptr failed");
+    return TEST_PASS;
diff --git a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
new file mode 100644
index 0000000..9b0bc9d
--- /dev/null
+++ b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
@@ -0,0 +1,1786 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <vulkan_interop_common.hpp>
+#include <vulkan_wrapper.hpp>
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <assert.h>
+#include <vector>
+#include <iostream>
+#include <string.h>
+#include "harness/errorHelpers.h"
+// Capacity of the per-test cl_mem arrays that hold the imported buffers.
+#define MAX_BUFFERS 5
+// Capacity of the per-buffer import arrays used by the multi-import test.
+#define MAX_IMPORTS 5
+// NOTE(review): not referenced in the visible part of this file; presumably
+// the default buffer size used by the callers - confirm.
+#define BUFFERSIZE 3000
+// NOTE(review): not referenced in the visible part of this file; presumably
+// holds the UUID of the selected device - confirm.
+static cl_uchar uuid[CL_UUID_SIZE_KHR];
+// Device handed to every clExternalSemaphore / clExternalMemory created by
+// the tests below.
+static cl_device_id deviceId = NULL;
+namespace {
+// Parameter block written into vkParamsBuffer (descriptor binding 0) before
+// the Vulkan command buffer is recorded.
+struct Params
+    // Number of buffers taking part in the current run.
+    uint32_t numBuffers;
+    // Size of each buffer, as passed to the test.
+    uint32_t bufferSize;
+    // Distance between consecutive buffers inside one allocation; the tests
+    // set it to 0 when each buffer has its own device memory.
+    uint32_t interBufferOffset;
+// OpenCL C sources used by the buffer interop tests.
+// kernel_text_numbuffer_1/_2/_4 each define clUpdateBuffer, which increments
+// element gid of one, two or four byte buffers when gid < bufferSize.
+// kernel_text_verify defines checkKernel, which sets *err to 1 when an
+// element of ptr differs from expVal.
+const char *kernel_text_numbuffer_1 = " \
+__kernel void clUpdateBuffer(int bufferSize, __global unsigned char *a) {  \n\
+    int gid = get_global_id(0); \n\
+    if (gid < bufferSize) { \n\
+        a[gid]++; \n\
+    } \n\
+const char *kernel_text_numbuffer_2 = " \
+__kernel void clUpdateBuffer(int bufferSize, __global unsigned char *a, __global unsigned char *b) {  \n\
+    int gid = get_global_id(0); \n\
+    if (gid < bufferSize) { \n\
+        a[gid]++; \n\
+        b[gid]++;\n\
+    } \n\
+const char *kernel_text_numbuffer_4 = " \
+__kernel void clUpdateBuffer(int bufferSize, __global unsigned char *a, __global unsigned char *b, __global unsigned char *c, __global unsigned char *d) {  \n\
+    int gid = get_global_id(0); \n\
+    if (gid < bufferSize) { \n\
+        a[gid]++;\n\
+        b[gid]++; \n\
+        c[gid]++; \n\
+        d[gid]++; \n\
+    } \n\
+const char *kernel_text_verify = " \
+__kernel void checkKernel(__global unsigned char *ptr, int size, int expVal, __global unsigned char *err)     \n\
+{                                                                                         \n\
+    int idx = get_global_id(0);                                                           \n\
+    if ((idx < size) && (*err == 0)) { \n\
+    if (ptr[idx] != expVal){           \n\
+            *err = 1;                  \n\
+           }                           \n\
+        }                              \n\
+// Exercises Vulkan/OpenCL buffer sharing with two OpenCL command queues.
+//
+// For each supported external memory handle type, and each memory type the
+// probe buffer reports for it, numBuffers Vulkan buffers are created and
+// exposed to OpenCL through clExternalMemory. Every iteration submits the
+// Vulkan command buffer, then runs update_buffer_kernel on cmd_queue1 (all
+// buffers except the last) followed by kernel_cq on cmd_queue2 (first and
+// last buffer), ordered by the first_launch event. verify_kernel then checks
+// each buffer against 3 * maxIter for buffer 0 and 2 * maxIter for the rest.
+//
+// kernel points at the update kernels; entries 0, 1 and 2 are used for 2, 3
+// and 5 buffers respectively.
+//
+// Returns CL_SUCCESS when every buffer verifies; otherwise the failing OpenCL
+// error code, CL_OUT_OF_HOST_MEMORY if the host result byte cannot be
+// allocated, or -1 when verify_kernel reports a mismatch.
+int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
+                            cl_command_queue &cmd_queue2, cl_kernel *kernel,
+                            cl_kernel &verify_kernel, VulkanDevice &vkDevice,
+                            uint32_t numBuffers, uint32_t bufferSize)
+    int err = CL_SUCCESS;
+    size_t global_work_size[1];
+    // Everything the cleanup code inspects starts out as NULL so that an
+    // early `goto CLEANUP` never frees or releases an indeterminate value.
+    uint8_t *error_2 = NULL;
+    cl_mem error_1 = NULL;
+    cl_kernel update_buffer_kernel = NULL;
+    cl_kernel kernel_cq = NULL;
+    clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
+    const char *program_source_const = kernel_text_numbuffer_2;
+    size_t program_source_length = strlen(program_source_const);
+    cl_program program = clCreateProgramWithSource(
+        context, 1, &program_source_const, &program_source_length, &err);
+    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        print_error(err, "Error: Failed to build program \n");
+        // Only the program exists at this point; release it before leaving.
+        if (program) clReleaseProgram(program);
+        return err;
+    }
+    // create the kernel
+    kernel_cq = clCreateKernel(program, "clUpdateBuffer", &err);
+    if (err != CL_SUCCESS)
+    {
+        print_error(err, "clCreateKernel failed \n");
+        clReleaseProgram(program);
+        return err;
+    }
+    const std::vector<VulkanExternalMemoryHandleType>
+        vkExternalMemoryHandleTypeList =
+            getSupportedVulkanExternalMemoryHandleTypeList();
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanQueue &vkQueue = vkDevice.getQueue();
+    std::vector<char> vkBufferShader = readFile("buffer.spv");
+    VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
+    // NOTE(review): the argument list of this constructor is cut off in the
+    // patch text; confirm against the complete file.
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
+    VulkanDescriptorSetLayout vkDescriptorSetLayout(
+        vkDevice, vkDescriptorSetLayoutBindingList);
+    VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
+    VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout,
+                                            vkBufferShaderModule);
+    VulkanDescriptorPool vkDescriptorPool(vkDevice,
+                                          vkDescriptorSetLayoutBindingList);
+    VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
+                                        vkDescriptorSetLayout);
+    clVk2CLExternalSemaphore = new clExternalSemaphore(
+        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore = new clExternalSemaphore(
+        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    const uint32_t maxIter = innerIterations;
+    VulkanCommandPool vkCommandPool(vkDevice);
+    VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
+    VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params));
+    // NOTE(review): the getVulkanMemoryType() call below is cut off in the
+    // patch text; confirm against the complete file.
+    VulkanDeviceMemory vkParamsDeviceMemory(
+        vkDevice, vkParamsBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkParamsDeviceMemory.bindBuffer(vkParamsBuffer);
+    std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory;
+    std::vector<clExternalMemory *> externalMemory;
+    for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size();
+         emhtIdx++)
+    {
+        VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
+            vkExternalMemoryHandleTypeList[emhtIdx];
+        log_info("External memory handle type: %d\n",
+                 vkExternalMemoryHandleType);
+        // Probe buffer used only to enumerate the memory types that are
+        // usable with this handle type.
+        VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024,
+                                   vkExternalMemoryHandleType);
+        const VulkanMemoryTypeList &memoryTypeList =
+            vkDummyBuffer.getMemoryTypeList();
+        for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+        {
+            const VulkanMemoryType &memoryType = memoryTypeList[mtIdx];
+            log_info("Memory type index: %d\n", (uint32_t)memoryType);
+            log_info("Memory type property: %d\n",
+                     memoryType.getMemoryTypeProperty());
+            VulkanBufferList vkBufferList(numBuffers, vkDevice, bufferSize,
+                                          vkExternalMemoryHandleType);
+            // Both vectors are emptied at the end of every round, so index
+            // bIdx always refers to the entry pushed for this buffer.
+            for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
+            {
+                vkBufferListDeviceMemory.push_back(
+                    new VulkanDeviceMemory(vkDevice, bufferSize, memoryType,
+                                           vkExternalMemoryHandleType));
+                externalMemory.push_back(new clExternalMemory(
+                    vkBufferListDeviceMemory[bIdx], vkExternalMemoryHandleType,
+                    0, bufferSize, context, deviceId));
+            }
+            cl_mem buffers[MAX_BUFFERS];
+            clFinish(cmd_queue1);
+            // NOTE(review): the cast operand is missing in the patch text; the
+            // unmap() below suggests vkParamsDeviceMemory.map() - confirm.
+            Params *params = (Params *);
+            params->numBuffers = numBuffers;
+            params->bufferSize = bufferSize;
+            params->interBufferOffset = 0;
+            vkParamsDeviceMemory.unmap();
+            vkDescriptorSet.update(0, vkParamsBuffer);
+            for (size_t bIdx = 0; bIdx < vkBufferList.size(); bIdx++)
+            {
+                vkBufferListDeviceMemory[bIdx]->bindBuffer(vkBufferList[bIdx],
+                                                           0);
+                buffers[bIdx] = externalMemory[bIdx]->getExternalMemoryBuffer();
+                // Binding 0 holds the parameter block, buffers start at 1.
+                vkDescriptorSet.update((uint32_t)bIdx + 1, vkBufferList[bIdx]);
+            }
+            vkCommandBuffer.begin();
+            vkCommandBuffer.bindPipeline(vkComputePipeline);
+            vkCommandBuffer.bindDescriptorSets(
+                vkComputePipeline, vkPipelineLayout, vkDescriptorSet);
+            vkCommandBuffer.dispatch(512, 1, 1);
+            vkCommandBuffer.end();
+            // The last buffer is handled by kernel_cq, so the update kernel
+            // takes one buffer fewer than the list holds.
+            if (vkBufferList.size() == 2)
+            {
+                update_buffer_kernel = kernel[0];
+            }
+            else if (vkBufferList.size() == 3)
+            {
+                update_buffer_kernel = kernel[1];
+            }
+            else if (vkBufferList.size() == 5)
+            {
+                update_buffer_kernel = kernel[2];
+            }
+            // global work size should be less than or equal to
+            // bufferSizeList[i]
+            global_work_size[0] = bufferSize;
+            for (uint32_t iter = 0; iter < maxIter; iter++)
+            {
+                // The first submission has nothing to wait for; later ones
+                // wait for OpenCL to signal the CL-to-Vulkan semaphore.
+                if (iter == 0)
+                {
+                    vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                }
+                else
+                {
+                    vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                   vkVk2CLSemaphore);
+                }
+                clVk2CLExternalSemaphore->wait(cmd_queue1);
+                err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t),
+                                     (void *)&bufferSize);
+                err |= clSetKernelArg(kernel_cq, 0, sizeof(uint32_t),
+                                      (void *)&bufferSize);
+                err |= clSetKernelArg(kernel_cq, 1, sizeof(cl_mem),
+                                      (void *)&(buffers[0]));
+                for (size_t i = 0; i < vkBufferList.size() - 1; i++)
+                {
+                    err |= clSetKernelArg(update_buffer_kernel,
+                                          (cl_uint)(i + 1), sizeof(cl_mem),
+                                          (void *)&(buffers[i]));
+                }
+                err |=
+                    clSetKernelArg(kernel_cq, 2, sizeof(cl_mem),
+                                   (void *)&(buffers[vkBufferList.size() - 1]));
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to set arg values for kernel\n");
+                    goto CLEANUP;
+                }
+                cl_event first_launch;
+                err = clEnqueueNDRangeKernel(cmd_queue1, update_buffer_kernel,
+                                             1, NULL, global_work_size, NULL, 0,
+                                             NULL, &first_launch);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to launch update_buffer_kernel,"
+                                "error\n");
+                    goto CLEANUP;
+                }
+                // kernel_cq on cmd_queue2 waits on first_launch, so cmd_queue1
+                // has to be flushed for that event to be able to complete.
+                err = clFlush(cmd_queue1);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err, "Error: Failed to flush cmd_queue1\n");
+                    clReleaseEvent(first_launch);
+                    goto CLEANUP;
+                }
+                err = clEnqueueNDRangeKernel(cmd_queue2, kernel_cq, 1, NULL,
+                                             global_work_size, NULL, 1,
+                                             &first_launch, NULL);
+                // The event was only needed as the wait-list entry above.
+                clReleaseEvent(first_launch);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to launch kernel_cq, error\n");
+                    goto CLEANUP;
+                }
+                if (iter != (maxIter - 1))
+                {
+                    clCl2VkExternalSemaphore->signal(cmd_queue2);
+                }
+            }
+            error_2 = (uint8_t *)malloc(sizeof(uint8_t));
+            if (NULL == error_2)
+            {
+                log_error("Not able to allocate memory\n");
+                // err is still CL_SUCCESS here; report the failure instead.
+                err = CL_OUT_OF_HOST_MEMORY;
+                goto CLEANUP;
+            }
+            clFinish(cmd_queue2);
+            error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                     sizeof(uint8_t), NULL, &err);
+            if (CL_SUCCESS != err)
+            {
+                print_error(err, "Error: clCreateBuffer \n");
+                goto CLEANUP;
+            }
+            uint8_t val = 0;
+            err = clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                       sizeof(uint8_t), &val, 0, NULL, NULL);
+            if (err != CL_SUCCESS)
+            {
+                print_error(err, "Error: clEnqueueWriteBuffer \n");
+                goto CLEANUP;
+            }
+            int calc_max_iter;
+            for (size_t i = 0; i < vkBufferList.size(); i++)
+            {
+                // Buffer 0 is updated by the Vulkan dispatch and by both
+                // OpenCL kernels; every other buffer by the dispatch and one
+                // OpenCL kernel.
+                if (i == 0)
+                    calc_max_iter = (maxIter * 3);
+                else
+                    calc_max_iter = (maxIter * 2);
+                err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem),
+                                     (void *)&(buffers[i]));
+                err |=
+                    clSetKernelArg(verify_kernel, 1, sizeof(int), &bufferSize);
+                err |= clSetKernelArg(verify_kernel, 2, sizeof(int),
+                                      &calc_max_iter);
+                err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem),
+                                      (void *)&error_1);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to set arg values for "
+                                "verify_kernel \n");
+                    goto CLEANUP;
+                }
+                err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1, NULL,
+                                             global_work_size, NULL, 0, NULL,
+                                             NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to launch verify_kernel,"
+                                "error \n");
+                    goto CLEANUP;
+                }
+                err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                          sizeof(uint8_t), error_2, 0, NULL,
+                                          NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err, "Error: Failed read output, error \n ");
+                    goto CLEANUP;
+                }
+                if (*error_2 == 1)
+                {
+                    log_error("&&&& vulkan_opencl_buffer test FAILED\n");
+                    // The read above succeeded, so err would otherwise report
+                    // success for a failed verification.
+                    err = -1;
+                    goto CLEANUP;
+                }
+            }
+            // The verification scratch objects are recreated for every memory
+            // type; release this round's pair so they do not accumulate.
+            free(error_2);
+            error_2 = NULL;
+            clReleaseMemObject(error_1);
+            error_1 = NULL;
+            for (size_t i = 0; i < vkBufferList.size(); i++)
+            {
+                delete vkBufferListDeviceMemory[i];
+                delete externalMemory[i];
+            }
+            vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(),
+                                           vkBufferListDeviceMemory.begin()
+                                               + numBuffers);
+            externalMemory.erase(externalMemory.begin(),
+                                 externalMemory.begin() + numBuffers);
+        }
+    }
+    // NOTE(review): the CLEANUP label targeted by the gotos above is not
+    // visible in the patch text; it presumably precedes this loop - confirm.
+    // Entries are only left in the vectors when a round was abandoned.
+    for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++)
+    {
+        if (vkBufferListDeviceMemory[i])
+        {
+            delete vkBufferListDeviceMemory[i];
+        }
+        if (externalMemory[i])
+        {
+            delete externalMemory[i];
+        }
+    }
+    if (program) clReleaseProgram(program);
+    if (kernel_cq) clReleaseKernel(kernel_cq);
+    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (error_2) free(error_2);
+    if (error_1) clReleaseMemObject(error_1);
+    return err;
+// Exercises Vulkan/OpenCL buffer sharing with a single OpenCL command queue.
+//
+// For each supported external memory handle type, and each memory type the
+// probe buffer reports for it, numBuffers Vulkan buffers are created and
+// exposed to OpenCL through clExternalMemory. Every iteration submits the
+// Vulkan command buffer and then runs update_buffer_kernel on cmd_queue1
+// over all buffers, with the two external semaphores ordering the Vulkan and
+// OpenCL work. verify_kernel then checks each buffer against 2 * maxIter.
+//
+// kernel points at the update kernels; entries 0, 1 and 2 are used for 1, 2
+// and 4 buffers respectively.
+//
+// Returns CL_SUCCESS when every buffer verifies; otherwise the failing OpenCL
+// error code, CL_OUT_OF_HOST_MEMORY if the host result byte cannot be
+// allocated, or -1 when verify_kernel reports a mismatch.
+int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
+                            cl_kernel *kernel, cl_kernel &verify_kernel,
+                            VulkanDevice &vkDevice, uint32_t numBuffers,
+                            uint32_t bufferSize)
+    log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
+    size_t global_work_size[1];
+    // Everything the cleanup code inspects starts out as NULL so that an
+    // early `goto CLEANUP` never frees or releases an indeterminate value.
+    uint8_t *error_2 = NULL;
+    cl_mem error_1 = NULL;
+    cl_kernel update_buffer_kernel = NULL;
+    clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
+    int err = CL_SUCCESS;
+    const std::vector<VulkanExternalMemoryHandleType>
+        vkExternalMemoryHandleTypeList =
+            getSupportedVulkanExternalMemoryHandleTypeList();
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanQueue &vkQueue = vkDevice.getQueue();
+    std::vector<char> vkBufferShader = readFile("buffer.spv");
+    VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
+    // NOTE(review): the argument list of this constructor is cut off in the
+    // patch text; confirm against the complete file.
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
+    VulkanDescriptorSetLayout vkDescriptorSetLayout(
+        vkDevice, vkDescriptorSetLayoutBindingList);
+    VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
+    VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout,
+                                            vkBufferShaderModule);
+    VulkanDescriptorPool vkDescriptorPool(vkDevice,
+                                          vkDescriptorSetLayoutBindingList);
+    VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
+                                        vkDescriptorSetLayout);
+    clVk2CLExternalSemaphore = new clExternalSemaphore(
+        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore = new clExternalSemaphore(
+        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    const uint32_t maxIter = innerIterations;
+    VulkanCommandPool vkCommandPool(vkDevice);
+    VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
+    VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params));
+    // NOTE(review): the getVulkanMemoryType() call below is cut off in the
+    // patch text; confirm against the complete file.
+    VulkanDeviceMemory vkParamsDeviceMemory(
+        vkDevice, vkParamsBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkParamsDeviceMemory.bindBuffer(vkParamsBuffer);
+    std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory;
+    std::vector<clExternalMemory *> externalMemory;
+    for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size();
+         emhtIdx++)
+    {
+        VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
+            vkExternalMemoryHandleTypeList[emhtIdx];
+        log_info("External memory handle type: %d\n",
+                 vkExternalMemoryHandleType);
+        // Probe buffer used only to enumerate the memory types that are
+        // usable with this handle type.
+        VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024,
+                                   vkExternalMemoryHandleType);
+        const VulkanMemoryTypeList &memoryTypeList =
+            vkDummyBuffer.getMemoryTypeList();
+        for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+        {
+            const VulkanMemoryType &memoryType = memoryTypeList[mtIdx];
+            log_info("Memory type index: %d\n", (uint32_t)memoryType);
+            log_info("Memory type property: %d\n",
+                     memoryType.getMemoryTypeProperty());
+            VulkanBufferList vkBufferList(numBuffers, vkDevice, bufferSize,
+                                          vkExternalMemoryHandleType);
+            // Both vectors are emptied at the end of every round, so index
+            // bIdx always refers to the entry pushed for this buffer.
+            for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
+            {
+                vkBufferListDeviceMemory.push_back(
+                    new VulkanDeviceMemory(vkDevice, bufferSize, memoryType,
+                                           vkExternalMemoryHandleType));
+                externalMemory.push_back(new clExternalMemory(
+                    vkBufferListDeviceMemory[bIdx], vkExternalMemoryHandleType,
+                    0, bufferSize, context, deviceId));
+            }
+            // Sized with the file-wide limit, like the two-queue variant.
+            cl_mem buffers[MAX_BUFFERS];
+            clFinish(cmd_queue1);
+            // NOTE(review): the cast operand is missing in the patch text; the
+            // unmap() below suggests vkParamsDeviceMemory.map() - confirm.
+            Params *params = (Params *);
+            params->numBuffers = numBuffers;
+            params->bufferSize = bufferSize;
+            params->interBufferOffset = 0;
+            vkParamsDeviceMemory.unmap();
+            vkDescriptorSet.update(0, vkParamsBuffer);
+            for (size_t bIdx = 0; bIdx < vkBufferList.size(); bIdx++)
+            {
+                vkBufferListDeviceMemory[bIdx]->bindBuffer(vkBufferList[bIdx],
+                                                           0);
+                buffers[bIdx] = externalMemory[bIdx]->getExternalMemoryBuffer();
+                // Binding 0 holds the parameter block, buffers start at 1.
+                vkDescriptorSet.update((uint32_t)bIdx + 1, vkBufferList[bIdx]);
+            }
+            vkCommandBuffer.begin();
+            vkCommandBuffer.bindPipeline(vkComputePipeline);
+            vkCommandBuffer.bindDescriptorSets(
+                vkComputePipeline, vkPipelineLayout, vkDescriptorSet);
+            vkCommandBuffer.dispatch(512, 1, 1);
+            vkCommandBuffer.end();
+            if (vkBufferList.size() == 1)
+            {
+                update_buffer_kernel = kernel[0];
+            }
+            else if (vkBufferList.size() == 2)
+            {
+                update_buffer_kernel = kernel[1];
+            }
+            else if (vkBufferList.size() == 4)
+            {
+                update_buffer_kernel = kernel[2];
+            }
+            // global work size should be less than or equal to
+            // bufferSizeList[i]
+            global_work_size[0] = bufferSize;
+            for (uint32_t iter = 0; iter < maxIter; iter++)
+            {
+                // The first submission has nothing to wait for; later ones
+                // wait for OpenCL to signal the CL-to-Vulkan semaphore.
+                if (iter == 0)
+                {
+                    vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                }
+                else
+                {
+                    vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                   vkVk2CLSemaphore);
+                }
+                clVk2CLExternalSemaphore->wait(cmd_queue1);
+                err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t),
+                                     (void *)&bufferSize);
+                for (size_t i = 0; i < vkBufferList.size(); i++)
+                {
+                    err |= clSetKernelArg(update_buffer_kernel,
+                                          (cl_uint)(i + 1), sizeof(cl_mem),
+                                          (void *)&(buffers[i]));
+                }
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to set arg values for kernel\n");
+                    goto CLEANUP;
+                }
+                err = clEnqueueNDRangeKernel(cmd_queue1, update_buffer_kernel,
+                                             1, NULL, global_work_size, NULL, 0,
+                                             NULL, NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err,
+                                "Error: Failed to launch update_buffer_kernel,"
+                                " error\n");
+                    goto CLEANUP;
+                }
+                if (iter != (maxIter - 1))
+                {
+                    clCl2VkExternalSemaphore->signal(cmd_queue1);
+                }
+            }
+            error_2 = (uint8_t *)malloc(sizeof(uint8_t));
+            if (NULL == error_2)
+            {
+                log_error("Not able to allocate memory\n");
+                // err is still CL_SUCCESS here; report the failure instead.
+                err = CL_OUT_OF_HOST_MEMORY;
+                goto CLEANUP;
+            }
+            error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                     sizeof(uint8_t), NULL, &err);
+            if (CL_SUCCESS != err)
+            {
+                print_error(err, "Error: clCreateBuffer \n");
+                goto CLEANUP;
+            }
+            uint8_t val = 0;
+            err = clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                       sizeof(uint8_t), &val, 0, NULL, NULL);
+            if (CL_SUCCESS != err)
+            {
+                print_error(err, "Error: clEnqueueWriteBuffer \n");
+                goto CLEANUP;
+            }
+            // Value passed to verify_kernel as the expected content of every
+            // buffer element.
+            int calc_max_iter = (maxIter * 2);
+            for (size_t i = 0; i < vkBufferList.size(); i++)
+            {
+                err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem),
+                                     (void *)&(buffers[i]));
+                err |=
+                    clSetKernelArg(verify_kernel, 1, sizeof(int), &bufferSize);
+                err |= clSetKernelArg(verify_kernel, 2, sizeof(int),
+                                      &calc_max_iter);
+                err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem),
+                                      (void *)&error_1);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(
+                        err,
+                        "Error: Failed to set arg values for verify_kernel \n");
+                    goto CLEANUP;
+                }
+                err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1, NULL,
+                                             global_work_size, NULL, 0, NULL,
+                                             NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(
+                        err, "Error: Failed to launch verify_kernel, error\n");
+                    goto CLEANUP;
+                }
+                err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                          sizeof(uint8_t), error_2, 0, NULL,
+                                          NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err, "Error: Failed read output, error  \n");
+                    goto CLEANUP;
+                }
+                if (*error_2 == 1)
+                {
+                    log_error("&&&& vulkan_opencl_buffer test FAILED\n");
+                    // The read above succeeded, so err would otherwise report
+                    // success for a failed verification.
+                    err = -1;
+                    goto CLEANUP;
+                }
+            }
+            // The verification scratch objects are recreated for every memory
+            // type; release this round's pair so they do not accumulate.
+            free(error_2);
+            error_2 = NULL;
+            clReleaseMemObject(error_1);
+            error_1 = NULL;
+            for (size_t i = 0; i < vkBufferList.size(); i++)
+            {
+                delete vkBufferListDeviceMemory[i];
+                delete externalMemory[i];
+            }
+            vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(),
+                                           vkBufferListDeviceMemory.begin()
+                                               + numBuffers);
+            externalMemory.erase(externalMemory.begin(),
+                                 externalMemory.begin() + numBuffers);
+        }
+    }
+    // NOTE(review): the CLEANUP label targeted by the gotos above is not
+    // visible in the patch text; it presumably precedes this loop - confirm.
+    // Entries are only left in the vectors when a round was abandoned.
+    for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++)
+    {
+        if (vkBufferListDeviceMemory[i])
+        {
+            delete vkBufferListDeviceMemory[i];
+        }
+        if (externalMemory[i])
+        {
+            delete externalMemory[i];
+        }
+    }
+    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (error_2) free(error_2);
+    if (error_1) clReleaseMemObject(error_1);
+    return err;
+int run_test_with_multi_import_same_ctx(
+    cl_context &context, cl_command_queue &cmd_queue1, cl_kernel *kernel,
+    cl_kernel &verify_kernel, VulkanDevice &vkDevice, uint32_t numBuffers,
+    uint32_t bufferSize, uint32_t bufferSizeForOffset)
+    size_t global_work_size[1];
+    uint8_t *error_2;
+    cl_mem error_1;
+    int numImports = numBuffers;
+    cl_kernel update_buffer_kernel[MAX_IMPORTS];
+    clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
+    int err = CL_SUCCESS;
+    int calc_max_iter;
+    bool withOffset;
+    uint32_t pBufferSize;
+    const std::vector<VulkanExternalMemoryHandleType>
+        vkExternalMemoryHandleTypeList =
+            getSupportedVulkanExternalMemoryHandleTypeList();
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanQueue &vkQueue = vkDevice.getQueue();
+    std::vector<char> vkBufferShader = readFile("buffer.spv");
+    VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
+    VulkanDescriptorSetLayout vkDescriptorSetLayout(
+        vkDevice, vkDescriptorSetLayoutBindingList);
+    VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
+    VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout,
+                                            vkBufferShaderModule);
+    VulkanDescriptorPool vkDescriptorPool(vkDevice,
+                                          vkDescriptorSetLayoutBindingList);
+    VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
+                                        vkDescriptorSetLayout);
+    clVk2CLExternalSemaphore = new clExternalSemaphore(
+        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore = new clExternalSemaphore(
+        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    const uint32_t maxIter = innerIterations;
+    VulkanCommandPool vkCommandPool(vkDevice);
+    VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
+    VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params));
+    VulkanDeviceMemory vkParamsDeviceMemory(
+        vkDevice, vkParamsBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkParamsDeviceMemory.bindBuffer(vkParamsBuffer);
+    std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory;
+    std::vector<std::vector<clExternalMemory *>> externalMemory;
+    for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size();
+         emhtIdx++)
+    {
+        VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
+            vkExternalMemoryHandleTypeList[emhtIdx];
+        log_info("External memory handle type: %d\n",
+                 vkExternalMemoryHandleType);
+        VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024,
+                                   vkExternalMemoryHandleType);
+        const VulkanMemoryTypeList &memoryTypeList =
+            vkDummyBuffer.getMemoryTypeList();
+        for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+        {
+            const VulkanMemoryType &memoryType = memoryTypeList[mtIdx];
+            log_info("Memory type index: %d\n", (uint32_t)memoryType);
+            log_info("Memory type property: %d\n",
+                     memoryType.getMemoryTypeProperty());
+            for (unsigned int withOffset = 0;
+                 withOffset <= (unsigned int)enableOffset; withOffset++)
+            {
+                log_info("Running withOffset case %d\n", (uint32_t)withOffset);
+                if (withOffset)
+                {
+                    pBufferSize = bufferSizeForOffset;
+                }
+                else
+                {
+                    pBufferSize = bufferSize;
+                }
+                cl_mem buffers[MAX_BUFFERS][MAX_IMPORTS];
+                VulkanBufferList vkBufferList(numBuffers, vkDevice, pBufferSize,
+                                              vkExternalMemoryHandleType);
+                uint32_t interBufferOffset =
+                    (uint32_t)(vkBufferList[0].getSize());
+                for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
+                {
+                    if (withOffset == 0)
+                    {
+                        vkBufferListDeviceMemory.push_back(
+                            new VulkanDeviceMemory(vkDevice, pBufferSize,
+                                                   memoryType,
+                                                   vkExternalMemoryHandleType));
+                    }
+                    if (withOffset == 1)
+                    {
+                        uint32_t totalSize =
+                            (uint32_t)(vkBufferList.size() * interBufferOffset);
+                        vkBufferListDeviceMemory.push_back(
+                            new VulkanDeviceMemory(vkDevice, totalSize,
+                                                   memoryType,
+                                                   vkExternalMemoryHandleType));
+                    }
+                    std::vector<clExternalMemory *> pExternalMemory;
+                    for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++)
+                    {
+                        pExternalMemory.push_back(new clExternalMemory(
+                            vkBufferListDeviceMemory[bIdx],
+                            vkExternalMemoryHandleType,
+                            withOffset * bIdx * interBufferOffset, pBufferSize,
+                            context, deviceId));
+                    }
+                    externalMemory.push_back(pExternalMemory);
+                }
+                clFinish(cmd_queue1);
+                Params *params = (Params *);
+                params->numBuffers = numBuffers;
+                params->bufferSize = pBufferSize;
+                params->interBufferOffset = interBufferOffset * withOffset;
+                vkParamsDeviceMemory.unmap();
+                vkDescriptorSet.update(0, vkParamsBuffer);
+                for (size_t bIdx = 0; bIdx < vkBufferList.size(); bIdx++)
+                {
+                    size_t buffer_size = vkBufferList[bIdx].getSize();
+                    vkBufferListDeviceMemory[bIdx]->bindBuffer(
+                        vkBufferList[bIdx],
+                        bIdx * interBufferOffset * withOffset);
+                    for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++)
+                    {
+                        buffers[bIdx][cl_bIdx] =
+                            externalMemory[bIdx][cl_bIdx]
+                                ->getExternalMemoryBuffer();
+                    }
+                    vkDescriptorSet.update((uint32_t)bIdx + 1,
+                                           vkBufferList[bIdx]);
+                }
+                vkCommandBuffer.begin();
+                vkCommandBuffer.bindPipeline(vkComputePipeline);
+                vkCommandBuffer.bindDescriptorSets(
+                    vkComputePipeline, vkPipelineLayout, vkDescriptorSet);
+                vkCommandBuffer.dispatch(512, 1, 1);
+                vkCommandBuffer.end();
+                for (int i = 0; i < numImports; i++)
+                {
+                    update_buffer_kernel[i] = (numBuffers == 1)
+                        ? kernel[0]
+                        : ((numBuffers == 2) ? kernel[1] : kernel[2]);
+                }
+                // global work size should be less than or equal to
+                // bufferSizeList[i]
+                global_work_size[0] = pBufferSize;
+                for (uint32_t iter = 0; iter < maxIter; iter++)
+                {
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+                    clVk2CLExternalSemaphore->wait(cmd_queue1);
+                    for (uint8_t launchIter = 0; launchIter < numImports;
+                         launchIter++)
+                    {
+                        err = clSetKernelArg(update_buffer_kernel[launchIter],
+                                             0, sizeof(uint32_t),
+                                             (void *)&pBufferSize);
+                        for (int i = 0; i < numBuffers; i++)
+                        {
+                            err |= clSetKernelArg(
+                                update_buffer_kernel[launchIter], i + 1,
+                                sizeof(cl_mem),
+                                (void *)&(buffers[i][launchIter]));
+                        }
+                        if (err != CL_SUCCESS)
+                        {
+                            print_error(err,
+                                        "Error: Failed to set arg values for "
+                                        "kernel\n ");
+                            goto CLEANUP;
+                        }
+                        err = clEnqueueNDRangeKernel(
+                            cmd_queue1, update_buffer_kernel[launchIter], 1,
+                            NULL, global_work_size, NULL, 0, NULL, NULL);
+                        if (err != CL_SUCCESS)
+                        {
+                            print_error(err,
+                                        "Error: Failed to launch "
+                                        "update_buffer_kernel, error\n ");
+                            goto CLEANUP;
+                        }
+                    }
+                    if (iter != (maxIter - 1))
+                    {
+                        clCl2VkExternalSemaphore->signal(cmd_queue1);
+                    }
+                }
+                error_2 = (uint8_t *)malloc(sizeof(uint8_t));
+                if (NULL == error_2)
+                {
+                    log_error("Not able to allocate memory\n");
+                    goto CLEANUP;
+                }
+                error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                         sizeof(uint8_t), NULL, &err);
+                if (CL_SUCCESS != err)
+                {
+                    print_error(err, "Error: clCreateBuffer \n");
+                    goto CLEANUP;
+                }
+                uint8_t val = 0;
+                err =
+                    clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                         sizeof(uint8_t), &val, 0, NULL, NULL);
+                if (CL_SUCCESS != err)
+                {
+                    print_error(err, "Error: clEnqueueWriteBuffer \n");
+                    goto CLEANUP;
+                }
+                calc_max_iter = maxIter * (numBuffers + 1);
+                for (int i = 0; i < vkBufferList.size(); i++)
+                {
+                    err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem),
+                                         (void *)&(buffers[i][0]));
+                    err |= clSetKernelArg(verify_kernel, 1, sizeof(int),
+                                          &pBufferSize);
+                    err |= clSetKernelArg(verify_kernel, 2, sizeof(int),
+                                          &calc_max_iter);
+                    err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem),
+                                          (void *)&error_1);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err,
+                                    "Error: Failed to set arg values for "
+                                    "verify_kernel \n");
+                        goto CLEANUP;
+                    }
+                    err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1,
+                                                 NULL, global_work_size, NULL,
+                                                 0, NULL, NULL);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(
+                            err,
+                            "Error: Failed to launch verify_kernel, error\n");
+                        goto CLEANUP;
+                    }
+                    err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                              sizeof(uint8_t), error_2, 0, NULL,
+                                              NULL);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err, "Error: Failed read output, error \n");
+                        goto CLEANUP;
+                    }
+                    if (*error_2 == 1)
+                    {
+                        log_error("&&&& vulkan_opencl_buffer test FAILED\n");
+                        goto CLEANUP;
+                    }
+                }
+                for (size_t i = 0; i < vkBufferList.size(); i++)
+                {
+                    for (size_t j = 0; j < numImports; j++)
+                    {
+                        delete externalMemory[i][j];
+                    }
+                }
+                for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++)
+                {
+                    delete vkBufferListDeviceMemory[i];
+                }
+                vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(),
+                                               vkBufferListDeviceMemory.end());
+                for (size_t i = 0; i < externalMemory.size(); i++)
+                {
+                    externalMemory[i].erase(externalMemory[i].begin(),
+                                            externalMemory[i].begin()
+                                                + numBuffers);
+                }
+                externalMemory.clear();
+            }
+        }
+    }
+    for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++)
+    {
+        if (vkBufferListDeviceMemory[i])
+        {
+            delete vkBufferListDeviceMemory[i];
+        }
+    }
+    for (size_t i = 0; i < externalMemory.size(); i++)
+    {
+        for (size_t j = 0; j < externalMemory[i].size(); j++)
+        {
+            if (externalMemory[i][j])
+            {
+                delete externalMemory[i][j];
+            }
+        }
+    }
+    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (error_2) free(error_2);
+    if (error_1) clReleaseMemObject(error_1);
+    return err;
+// Imports the same Vulkan buffer memory several times into two different
+// OpenCL contexts and checks that updates made through every import are
+// visible.
+//
+// For every supported external memory handle type, every memory type
+// reported for a dummy buffer of that handle type, and every withOffset case
+// in [0, enableOffset], the function:
+//   1. allocates Vulkan device memory for numBuffers buffers and wraps each
+//      allocation numImports (== numBuffers) times in a clExternalMemory for
+//      `context` and again for `context2`;
+//   2. records one Vulkan compute dispatch and runs maxIter rounds of
+//      "Vulkan submit -> CL kernels on cmd_queue1", then maxIter rounds of
+//      "Vulkan submit -> CL kernels on cmd_queue2", using the external
+//      semaphores created for the matching context;
+//   3. runs verify_kernel on cmd_queue1 and verify_kernel2 on cmd_queue2 for
+//      each buffer and reads back a one-byte flag; a value of 1 is logged as
+//      a failure.
+//
+// Parameters:
+//   context, context2       OpenCL contexts the Vulkan memory is imported to.
+//   cmd_queue1, cmd_queue2  queues used with context / context2 respectively.
+//   kernel1, kernel2        update kernels per context; index 0 is used when
+//                           numBuffers == 1, index 1 when numBuffers == 2,
+//                           index 2 otherwise.
+//   verify_kernel,
+//   verify_kernel2          verification kernels for context / context2.
+//   vkDevice                Vulkan device that owns all Vulkan objects.
+//   numBuffers              number of Vulkan buffers (and imports per buffer).
+//   bufferSize              buffer size used in the withOffset == 0 case.
+//   bufferSizeForOffset     buffer size used in the withOffset != 0 case.
+//
+// deviceId, innerIterations and enableOffset are not declared in this
+// function; they are taken from the enclosing file scope.
+//
+// Returns the last OpenCL status stored in err.
+// NOTE(review): the malloc-failure and verification-failure paths jump to the
+// cleanup code without modifying err, so they appear to return CL_SUCCESS --
+// confirm whether callers rely on the log output for those cases.
+int run_test_with_multi_import_diff_ctx(
+    cl_context &context, cl_context &context2, cl_command_queue &cmd_queue1,
+    cl_command_queue &cmd_queue2, cl_kernel *kernel1, cl_kernel *kernel2,
+    cl_kernel &verify_kernel, cl_kernel verify_kernel2, VulkanDevice &vkDevice,
+    uint32_t numBuffers, uint32_t bufferSize, uint32_t bufferSizeForOffset)
+    size_t global_work_size[1];
+    // The cleanup code at the end of the function tests these three handles
+    // unconditionally, so they must start out as NULL: an early jump to the
+    // cleanup code (or an empty handle type list) would otherwise read
+    // indeterminate values and free/release garbage.
+    uint8_t *error_3 = NULL;
+    cl_mem error_1 = NULL;
+    cl_mem error_2 = NULL;
+    int numImports = numBuffers;
+    cl_kernel update_buffer_kernel1[MAX_IMPORTS];
+    cl_kernel update_buffer_kernel2[MAX_IMPORTS];
+    clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
+    clExternalSemaphore *clVk2CLExternalSemaphore2 = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore2 = NULL;
+    int err = CL_SUCCESS;
+    int calc_max_iter;
+    bool withOffset;
+    uint32_t pBufferSize;
+    const std::vector<VulkanExternalMemoryHandleType>
+        vkExternalMemoryHandleTypeList =
+            getSupportedVulkanExternalMemoryHandleTypeList();
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanQueue &vkQueue = vkDevice.getQueue();
+    std::vector<char> vkBufferShader = readFile("buffer.spv");
+    VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
+    VulkanDescriptorSetLayout vkDescriptorSetLayout(
+        vkDevice, vkDescriptorSetLayoutBindingList);
+    VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
+    VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout,
+                                            vkBufferShaderModule);
+    VulkanDescriptorPool vkDescriptorPool(vkDevice,
+                                          vkDescriptorSetLayoutBindingList);
+    VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
+                                        vkDescriptorSetLayout);
+    // Both contexts wrap the same pair of Vulkan semaphores.
+    clVk2CLExternalSemaphore = new clExternalSemaphore(
+        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore = new clExternalSemaphore(
+        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clVk2CLExternalSemaphore2 = new clExternalSemaphore(
+        vkVk2CLSemaphore, context2, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore2 = new clExternalSemaphore(
+        vkCl2VkSemaphore, context2, vkExternalSemaphoreHandleType, deviceId);
+    const uint32_t maxIter = innerIterations;
+    VulkanCommandPool vkCommandPool(vkDevice);
+    VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
+    VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params));
+    VulkanDeviceMemory vkParamsDeviceMemory(
+        vkDevice, vkParamsBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkParamsDeviceMemory.bindBuffer(vkParamsBuffer);
+    std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory;
+    std::vector<std::vector<clExternalMemory *>> externalMemory1;
+    std::vector<std::vector<clExternalMemory *>> externalMemory2;
+    for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size();
+         emhtIdx++)
+    {
+        VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
+            vkExternalMemoryHandleTypeList[emhtIdx];
+        log_info("External memory handle type:%d\n",
+                 vkExternalMemoryHandleType);
+        // The dummy buffer is only used to query the candidate memory types
+        // for this handle type.
+        VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024,
+                                   vkExternalMemoryHandleType);
+        const VulkanMemoryTypeList &memoryTypeList =
+            vkDummyBuffer.getMemoryTypeList();
+        for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+        {
+            const VulkanMemoryType &memoryType = memoryTypeList[mtIdx];
+            log_info("Memory type index: %d\n", (uint32_t)memoryType);
+            log_info("Memory type property: %d\n",
+                     memoryType.getMemoryTypeProperty());
+            for (unsigned int withOffset = 0;
+                 withOffset <= (unsigned int)enableOffset; withOffset++)
+            {
+                log_info("Running withOffset case %d\n", (uint32_t)withOffset);
+                cl_mem buffers1[MAX_BUFFERS][MAX_IMPORTS];
+                cl_mem buffers2[MAX_BUFFERS][MAX_IMPORTS];
+                if (withOffset)
+                {
+                    pBufferSize = bufferSizeForOffset;
+                }
+                else
+                {
+                    pBufferSize = bufferSize;
+                }
+                VulkanBufferList vkBufferList(numBuffers, vkDevice, pBufferSize,
+                                              vkExternalMemoryHandleType);
+                uint32_t interBufferOffset =
+                    (uint32_t)(vkBufferList[0].getSize());
+                for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
+                {
+                    // Without offsets each allocation holds one buffer; with
+                    // offsets each allocation is sized for all buffers so
+                    // buffer bIdx can live at bIdx * interBufferOffset.
+                    if (withOffset == 0)
+                    {
+                        vkBufferListDeviceMemory.push_back(
+                            new VulkanDeviceMemory(vkDevice, pBufferSize,
+                                                   memoryType,
+                                                   vkExternalMemoryHandleType));
+                    }
+                    if (withOffset == 1)
+                    {
+                        uint32_t totalSize =
+                            (uint32_t)(vkBufferList.size() * interBufferOffset);
+                        vkBufferListDeviceMemory.push_back(
+                            new VulkanDeviceMemory(vkDevice, totalSize,
+                                                   memoryType,
+                                                   vkExternalMemoryHandleType));
+                    }
+                    std::vector<clExternalMemory *> pExternalMemory1;
+                    std::vector<clExternalMemory *> pExternalMemory2;
+                    // Import the same allocation numImports times into each
+                    // of the two contexts.
+                    for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++)
+                    {
+                        pExternalMemory1.push_back(new clExternalMemory(
+                            vkBufferListDeviceMemory[bIdx],
+                            vkExternalMemoryHandleType,
+                            withOffset * bIdx * interBufferOffset, pBufferSize,
+                            context, deviceId));
+                        pExternalMemory2.push_back(new clExternalMemory(
+                            vkBufferListDeviceMemory[bIdx],
+                            vkExternalMemoryHandleType,
+                            withOffset * bIdx * interBufferOffset, pBufferSize,
+                            context2, deviceId));
+                    }
+                    externalMemory1.push_back(pExternalMemory1);
+                    externalMemory2.push_back(pExternalMemory2);
+                }
+                clFinish(cmd_queue1);
+                Params *params = (Params *);
+                params->numBuffers = numBuffers;
+                params->bufferSize = pBufferSize;
+                params->interBufferOffset = interBufferOffset * withOffset;
+                vkParamsDeviceMemory.unmap();
+                vkDescriptorSet.update(0, vkParamsBuffer);
+                for (size_t bIdx = 0; bIdx < vkBufferList.size(); bIdx++)
+                {
+                    size_t buffer_size = vkBufferList[bIdx].getSize();
+                    vkBufferListDeviceMemory[bIdx]->bindBuffer(
+                        vkBufferList[bIdx],
+                        bIdx * interBufferOffset * withOffset);
+                    for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++)
+                    {
+                        buffers1[bIdx][cl_bIdx] =
+                            externalMemory1[bIdx][cl_bIdx]
+                                ->getExternalMemoryBuffer();
+                        buffers2[bIdx][cl_bIdx] =
+                            externalMemory2[bIdx][cl_bIdx]
+                                ->getExternalMemoryBuffer();
+                    }
+                    // Binding 0 holds the params buffer, so data buffers
+                    // start at binding 1.
+                    vkDescriptorSet.update((uint32_t)bIdx + 1,
+                                           vkBufferList[bIdx]);
+                }
+                vkCommandBuffer.begin();
+                vkCommandBuffer.bindPipeline(vkComputePipeline);
+                vkCommandBuffer.bindDescriptorSets(
+                    vkComputePipeline, vkPipelineLayout, vkDescriptorSet);
+                vkCommandBuffer.dispatch(512, 1, 1);
+                vkCommandBuffer.end();
+                for (int i = 0; i < numImports; i++)
+                {
+                    update_buffer_kernel1[i] = (numBuffers == 1)
+                        ? kernel1[0]
+                        : ((numBuffers == 2) ? kernel1[1] : kernel1[2]);
+                    update_buffer_kernel2[i] = (numBuffers == 1)
+                        ? kernel2[0]
+                        : ((numBuffers == 2) ? kernel2[1] : kernel2[2]);
+                }
+                // global work size should be less than or equal
+                // to bufferSizeList[i]
+                global_work_size[0] = pBufferSize;
+                // First pass: Vulkan dispatch followed by the update kernels
+                // of the first context on cmd_queue1.
+                for (uint32_t iter = 0; iter < maxIter; iter++)
+                {
+                    // Only the first submit has no CL -> Vulkan semaphore to
+                    // wait on.
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+                    clVk2CLExternalSemaphore->wait(cmd_queue1);
+                    for (uint8_t launchIter = 0; launchIter < numImports;
+                         launchIter++)
+                    {
+                        err = clSetKernelArg(update_buffer_kernel1[launchIter],
+                                             0, sizeof(uint32_t),
+                                             (void *)&pBufferSize);
+                        for (int i = 0; i < numBuffers; i++)
+                        {
+                            err |= clSetKernelArg(
+                                update_buffer_kernel1[launchIter], i + 1,
+                                sizeof(cl_mem),
+                                (void *)&(buffers1[i][launchIter]));
+                        }
+                        if (err != CL_SUCCESS)
+                        {
+                            print_error(err,
+                                        "Error: Failed to set arg values for "
+                                        "kernel\n ");
+                            goto CLEANUP;
+                        }
+                        err = clEnqueueNDRangeKernel(
+                            cmd_queue1, update_buffer_kernel1[launchIter], 1,
+                            NULL, global_work_size, NULL, 0, NULL, NULL);
+                        if (err != CL_SUCCESS)
+                        {
+                            print_error(err,
+                                        "Error: Failed to launch "
+                                        "update_buffer_kernel, error\n");
+                            goto CLEANUP;
+                        }
+                    }
+                    // The last iteration has no following Vulkan submit, so
+                    // nothing would consume the signal.
+                    if (iter != (maxIter - 1))
+                    {
+                        clCl2VkExternalSemaphore->signal(cmd_queue1);
+                    }
+                }
+                clFinish(cmd_queue1);
+                // Second pass: same sequence for the second context on
+                // cmd_queue2.
+                for (uint32_t iter = 0; iter < maxIter; iter++)
+                {
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+                    clVk2CLExternalSemaphore2->wait(cmd_queue2);
+                    for (uint8_t launchIter = 0; launchIter < numImports;
+                         launchIter++)
+                    {
+                        // NOTE(review): the first pass passes pBufferSize
+                        // here while this pass passes bufferSize; they differ
+                        // in the withOffset case -- confirm this is intended.
+                        err = clSetKernelArg(update_buffer_kernel2[launchIter],
+                                             0, sizeof(uint32_t),
+                                             (void *)&bufferSize);
+                        for (int i = 0; i < numBuffers; i++)
+                        {
+                            err |= clSetKernelArg(
+                                update_buffer_kernel2[launchIter], i + 1,
+                                sizeof(cl_mem),
+                                (void *)&(buffers2[i][launchIter]));
+                        }
+                        if (err != CL_SUCCESS)
+                        {
+                            print_error(err,
+                                        "Error: Failed to set arg values for "
+                                        "kernel\n ");
+                            goto CLEANUP;
+                        }
+                        err = clEnqueueNDRangeKernel(
+                            cmd_queue2, update_buffer_kernel2[launchIter], 1,
+                            NULL, global_work_size, NULL, 0, NULL, NULL);
+                        if (err != CL_SUCCESS)
+                        {
+                            print_error(err,
+                                        "Error: Failed to launch "
+                                        "update_buffer_kernel, error\n ");
+                            goto CLEANUP;
+                        }
+                    }
+                    if (iter != (maxIter - 1))
+                    {
+                        clCl2VkExternalSemaphore2->signal(cmd_queue2);
+                    }
+                }
+                clFinish(cmd_queue2);
+                // Scratch objects for verification: a host byte (error_3) and
+                // one single-byte device flag per context (error_1, error_2).
+                error_3 = (uint8_t *)malloc(sizeof(uint8_t));
+                if (NULL == error_3)
+                {
+                    log_error("Not able to allocate memory\n");
+                    goto CLEANUP;
+                }
+                error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                         sizeof(uint8_t), NULL, &err);
+                if (CL_SUCCESS != err)
+                {
+                    print_error(err, "Error: clCreateBuffer \n");
+                    goto CLEANUP;
+                }
+                error_2 = clCreateBuffer(context2, CL_MEM_WRITE_ONLY,
+                                         sizeof(uint8_t), NULL, &err);
+                if (CL_SUCCESS != err)
+                {
+                    print_error(err, "Error: clCreateBuffer \n");
+                    goto CLEANUP;
+                }
+                // Clear both device flags before running the verify kernels.
+                uint8_t val = 0;
+                err =
+                    clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                         sizeof(uint8_t), &val, 0, NULL, NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err, "Error: clEnqueueWriteBuffer \n");
+                    goto CLEANUP;
+                }
+                err =
+                    clEnqueueWriteBuffer(cmd_queue2, error_2, CL_TRUE, 0,
+                                         sizeof(uint8_t), &val, 0, NULL, NULL);
+                if (err != CL_SUCCESS)
+                {
+                    print_error(err, "Error: clEnqueueWriteBuffer \n");
+                    goto CLEANUP;
+                }
+                // Factor 2 accounts for the two passes (one per context).
+                calc_max_iter = maxIter * 2 * (numBuffers + 1);
+                for (int i = 0; i < numBuffers; i++)
+                {
+                    err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem),
+                                         (void *)&(buffers1[i][0]));
+                    err |= clSetKernelArg(verify_kernel, 1, sizeof(int),
+                                          &pBufferSize);
+                    err |= clSetKernelArg(verify_kernel, 2, sizeof(int),
+                                          &calc_max_iter);
+                    err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem),
+                                          (void *)&error_1);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err,
+                                    "Error: Failed to set arg values for "
+                                    "verify_kernel \n");
+                        goto CLEANUP;
+                    }
+                    err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1,
+                                                 NULL, global_work_size, NULL,
+                                                 0, NULL, NULL);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err,
+                                    "Error: Failed to launch verify_kernel,"
+                                    "error\n");
+                        goto CLEANUP;
+                    }
+                    err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0,
+                                              sizeof(uint8_t), error_3, 0, NULL,
+                                              NULL);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err, "Error: Failed read output, error\n");
+                        goto CLEANUP;
+                    }
+                    if (*error_3 == 1)
+                    {
+                        log_error("&&&& vulkan_opencl_buffer test FAILED\n");
+                        goto CLEANUP;
+                    }
+                }
+                *error_3 = 0;
+                for (int i = 0; i < vkBufferList.size(); i++)
+                {
+                    err = clSetKernelArg(verify_kernel2, 0, sizeof(cl_mem),
+                                         (void *)&(buffers2[i][0]));
+                    err |= clSetKernelArg(verify_kernel2, 1, sizeof(int),
+                                          &pBufferSize);
+                    err |= clSetKernelArg(verify_kernel2, 2, sizeof(int),
+                                          &calc_max_iter);
+                    err |= clSetKernelArg(verify_kernel2, 3, sizeof(cl_mem),
+                                          (void *)&error_2);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err,
+                                    "Error: Failed to set arg values for "
+                                    "verify_kernel \n");
+                        goto CLEANUP;
+                    }
+                    err = clEnqueueNDRangeKernel(cmd_queue2, verify_kernel2, 1,
+                                                 NULL, global_work_size, NULL,
+                                                 0, NULL, NULL);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err,
+                                    "Error: Failed to launch verify_kernel,"
+                                    "error\n");
+                        goto CLEANUP;
+                    }
+                    err = clEnqueueReadBuffer(cmd_queue2, error_2, CL_TRUE, 0,
+                                              sizeof(uint8_t), error_3, 0, NULL,
+                                              NULL);
+                    if (err != CL_SUCCESS)
+                    {
+                        print_error(err, "Error: Failed read output, error\n");
+                        goto CLEANUP;
+                    }
+                    if (*error_3 == 1)
+                    {
+                        log_error("&&&& vulkan_opencl_buffer test FAILED\n");
+                        goto CLEANUP;
+                    }
+                }
+                // Release this iteration's verification scratch objects so
+                // the next iteration does not overwrite (and leak) them. The
+                // handles are reset so the cleanup code at the end of the
+                // function skips them.
+                free(error_3);
+                error_3 = NULL;
+                clReleaseMemObject(error_1);
+                error_1 = NULL;
+                clReleaseMemObject(error_2);
+                error_2 = NULL;
+                // Destroy the per-iteration imports and allocations, then
+                // empty the containers so the final cleanup does not delete
+                // them a second time.
+                for (size_t i = 0; i < vkBufferList.size(); i++)
+                {
+                    for (size_t j = 0; j < numImports; j++)
+                    {
+                        delete externalMemory1[i][j];
+                        delete externalMemory2[i][j];
+                    }
+                }
+                for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++)
+                {
+                    delete vkBufferListDeviceMemory[i];
+                }
+                vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(),
+                                               vkBufferListDeviceMemory.end());
+                for (size_t i = 0; i < externalMemory1.size(); i++)
+                {
+                    externalMemory1[i].erase(externalMemory1[i].begin(),
+                                             externalMemory1[i].begin()
+                                                 + numBuffers);
+                    externalMemory2[i].erase(externalMemory2[i].begin(),
+                                             externalMemory2[i].begin()
+                                                 + numBuffers);
+                }
+                externalMemory1.clear();
+                externalMemory2.clear();
+            }
+        }
+    }
+    // Final cleanup: anything still held here was left behind by an early
+    // exit from the loops above.
+    for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++)
+    {
+        if (vkBufferListDeviceMemory[i])
+        {
+            delete vkBufferListDeviceMemory[i];
+        }
+    }
+    for (size_t i = 0; i < externalMemory1.size(); i++)
+    {
+        for (size_t j = 0; j < externalMemory1[i].size(); j++)
+        {
+            if (externalMemory1[i][j])
+            {
+                delete externalMemory1[i][j];
+            }
+        }
+    }
+    for (size_t i = 0; i < externalMemory2.size(); i++)
+    {
+        for (size_t j = 0; j < externalMemory2[i].size(); j++)
+        {
+            if (externalMemory2[i][j])
+            {
+                delete externalMemory2[i][j];
+            }
+        }
+    }
+    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (clVk2CLExternalSemaphore2) delete clVk2CLExternalSemaphore2;
+    if (clCl2VkExternalSemaphore2) delete clCl2VkExternalSemaphore2;
+    if (error_3) free(error_3);
+    if (error_1) clReleaseMemObject(error_1);
+    if (error_2) clReleaseMemObject(error_2);
+    return err;
+// Runs the Vulkan/OpenCL buffer interop tests on the OpenCL GPU device whose
+// UUID matches the Vulkan physical device.
+//
+// Builds the update and verify kernels (plus a second set in a second context
+// when the multiCtx flag declared outside this function is set) and then, for
+// every combination of numBuffersList and bufferSizeList, dispatches to the
+// run_test_with_* variant selected by multiImport, multiCtx and numCQ.
+//
+// The device_, context_, queue_ and numElements_ arguments are not used; the
+// function creates its own context and command queues.
+//
+// Returns CL_SUCCESS when every combination passes, otherwise the first error
+// code encountered. Every exit path goes through CLEANUP so that all OpenCL
+// objects and host allocations created here are released.
+int test_buffer_common(cl_device_id device_, cl_context context_,
+                       cl_command_queue queue_, int numElements_)
+{
+    int current_device = 0;
+    int device_count = 0;
+    int devices_prohibited = 0;
+    cl_int errNum = CL_SUCCESS;
+    cl_platform_id platform = NULL;
+    size_t extensionSize = 0;
+    cl_uint num_devices = 0;
+    cl_uint device_no = 0;
+    const size_t bufsize = BUFFERSIZE;
+    char buf[BUFFERSIZE];
+    // Must start as NULL: CLEANUP can be reached before the allocation below.
+    cl_device_id *devices = NULL;
+    char *extensions = NULL;
+    cl_kernel verify_kernel = NULL;
+    cl_kernel verify_kernel2 = NULL;
+    cl_kernel kernel[3] = { NULL, NULL, NULL };
+    cl_kernel kernel2[3] = { NULL, NULL, NULL };
+    const char *program_source_const[3] = { kernel_text_numbuffer_1,
+                                            kernel_text_numbuffer_2,
+                                            kernel_text_numbuffer_4 };
+    const char *program_source_const_verify;
+    size_t program_source_length;
+    cl_command_queue cmd_queue1 = NULL;
+    cl_command_queue cmd_queue2 = NULL;
+    cl_command_queue cmd_queue3 = NULL;
+    cl_context context = NULL;
+    cl_program program[3] = { NULL, NULL, NULL };
+    // Programs built in context2. Kept apart from program[] so that neither
+    // set of handles is overwritten and leaked.
+    cl_program program2[3] = { NULL, NULL, NULL };
+    cl_program program_verify = NULL, program_verify2 = NULL;
+    cl_context context2 = NULL;
+    VulkanDevice vkDevice;
+    uint32_t numBuffersList[] = { 1, 2, 4 };
+    uint32_t bufferSizeList[] = { 4 * 1024, 64 * 1024, 2 * 1024 * 1024 };
+    uint32_t bufferSizeListforOffset[] = { 256, 512, 1024 };
+    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    errNum = clGetPlatformIDs(1, &platform, NULL);
+    if (errNum != CL_SUCCESS)
+    {
+        print_error(errNum, "Error: Failed to get platform\n");
+        goto CLEANUP;
+    }
+    errNum =
+        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "clGetDeviceIDs failed in returning of devices\n");
+        goto CLEANUP;
+    }
+    devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+    if (NULL == devices)
+    {
+        errNum = CL_OUT_OF_HOST_MEMORY;
+        print_error(errNum, "Unable to allocate memory for devices\n");
+        goto CLEANUP;
+    }
+    errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices,
+                            NULL);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "Failed to get deviceID.\n");
+        goto CLEANUP;
+    }
+    contextProperties[1] = (cl_context_properties)platform;
+    log_info("Assigned contextproperties for platform\n");
+    // Find the OpenCL device whose UUID equals the Vulkan device's UUID.
+    for (device_no = 0; device_no < num_devices; device_no++)
+    {
+        // Drop the previous device's extension string before querying the
+        // next one, so only the last buffer is left for CLEANUP to free.
+        free(extensions);
+        extensions = NULL;
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0,
+                                 NULL, &extensionSize);
+        if (CL_SUCCESS != errNum)
+        {
+            print_error(errNum,
+                        "Error in clGetDeviceInfo for getting device_extension "
+                        "size....\n");
+            goto CLEANUP;
+        }
+        extensions = (char *)malloc(extensionSize);
+        if (NULL == extensions)
+        {
+            // Set the code first so the message reports the real failure.
+            errNum = CL_OUT_OF_HOST_MEMORY;
+            print_error(errNum, "Unable to allocate memory for extensions\n");
+            goto CLEANUP;
+        }
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS,
+                                 extensionSize, extensions, NULL);
+        if (CL_SUCCESS != errNum)
+        {
+            print_error(errNum,
+                        "Error in clGetDeviceInfo for device_extension\n");
+            goto CLEANUP;
+        }
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR,
+                                 CL_UUID_SIZE_KHR, uuid, &extensionSize);
+        if (CL_SUCCESS != errNum)
+        {
+            print_error(errNum, "clGetDeviceInfo failed\n");
+            goto CLEANUP;
+        }
+        // errNum is CL_SUCCESS here, so a match leaves it as success.
+        if (memcmp(uuid, vkDevice.getPhysicalDevice().getUUID(), VK_UUID_SIZE)
+            == 0)
+        {
+            break;
+        }
+    }
+    if (device_no >= num_devices)
+    {
+        errNum = EXIT_FAILURE;
+        print_error(errNum,
+                    "OpenCL error: "
+                    "No Vulkan-OpenCL Interop capable GPU found.\n");
+        goto CLEANUP;
+    }
+    deviceId = devices[device_no];
+    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
+                                      NULL, NULL, &errNum);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "error creating context\n");
+        goto CLEANUP;
+    }
+    log_info("Successfully created context !!!\n");
+    cmd_queue1 = clCreateCommandQueue(context, devices[device_no], 0, &errNum);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "Error: Failed to create command queue!\n");
+        goto CLEANUP;
+    }
+    cmd_queue2 = clCreateCommandQueue(context, devices[device_no], 0, &errNum);
+    if (CL_SUCCESS != errNum)
+    {
+        print_error(errNum, "Error: Failed to create command queue!\n");
+        goto CLEANUP;
+    }
+    log_info("clCreateCommandQueue successful\n");
+    // One update program/kernel per entry of numBuffersList (1, 2, 4 buffers).
+    for (int i = 0; i < 3; i++)
+    {
+        program_source_length = strlen(program_source_const[i]);
+        program[i] =
+            clCreateProgramWithSource(context, 1, &program_source_const[i],
+                                      &program_source_length, &errNum);
+        if (errNum != CL_SUCCESS)
+        {
+            print_error(errNum, "Error: Failed to create program \n");
+            goto CLEANUP;
+        }
+        errNum = clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL);
+        if (errNum != CL_SUCCESS)
+        {
+            print_error(errNum, "Error: Failed to build program \n");
+            goto CLEANUP;
+        }
+        // create the kernel
+        kernel[i] = clCreateKernel(program[i], "clUpdateBuffer", &errNum);
+        if (errNum != CL_SUCCESS)
+        {
+            print_error(errNum, "clCreateKernel failed \n");
+            goto CLEANUP;
+        }
+    }
+    program_source_const_verify = kernel_text_verify;
+    program_source_length = strlen(program_source_const_verify);
+    program_verify =
+        clCreateProgramWithSource(context, 1, &program_source_const_verify,
+                                  &program_source_length, &errNum);
+    if (errNum != CL_SUCCESS)
+    {
+        print_error(errNum, "Error: Failed to create program2\n");
+        goto CLEANUP;
+    }
+    errNum = clBuildProgram(program_verify, 0, NULL, NULL, NULL, NULL);
+    if (errNum != CL_SUCCESS)
+    {
+        log_error("Error: Failed to build program2\n");
+        goto CLEANUP;
+    }
+    verify_kernel = clCreateKernel(program_verify, "checkKernel", &errNum);
+    if (errNum != CL_SUCCESS)
+    {
+        print_error(errNum, "clCreateKernel failed \n");
+        goto CLEANUP;
+    }
+    if (multiCtx) // different context guard
+    {
+        context2 = clCreateContextFromType(
+            contextProperties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errNum);
+        if (CL_SUCCESS != errNum)
+        {
+            print_error(errNum, "error creating context\n");
+            goto CLEANUP;
+        }
+        cmd_queue3 =
+            clCreateCommandQueue(context2, devices[device_no], 0, &errNum);
+        if (CL_SUCCESS != errNum)
+        {
+            errNum = CL_INVALID_COMMAND_QUEUE;
+            print_error(errNum, "Error: Failed to create command queue!\n");
+            goto CLEANUP;
+        }
+        for (int i = 0; i < 3; i++)
+        {
+            program_source_length = strlen(program_source_const[i]);
+            program2[i] =
+                clCreateProgramWithSource(context2, 1, &program_source_const[i],
+                                          &program_source_length, &errNum);
+            if (errNum != CL_SUCCESS)
+            {
+                print_error(errNum, "Error: Failed to create program \n");
+                goto CLEANUP;
+            }
+            errNum = clBuildProgram(program2[i], 0, NULL, NULL, NULL, NULL);
+            if (errNum != CL_SUCCESS)
+            {
+                print_error(errNum, "Error: Failed to build program \n");
+                goto CLEANUP;
+            }
+            // create the kernel
+            kernel2[i] = clCreateKernel(program2[i], "clUpdateBuffer", &errNum);
+            if (errNum != CL_SUCCESS)
+            {
+                print_error(errNum, "clCreateKernel failed \n");
+                goto CLEANUP;
+            }
+        }
+        program_source_length = strlen(program_source_const_verify);
+        program_verify2 =
+            clCreateProgramWithSource(context2, 1, &program_source_const_verify,
+                                      &program_source_length, &errNum);
+        if (errNum != CL_SUCCESS)
+        {
+            print_error(errNum, "Error: Failed to create program2\n");
+            goto CLEANUP;
+        }
+        errNum = clBuildProgram(program_verify2, 0, NULL, NULL, NULL, NULL);
+        if (errNum != CL_SUCCESS)
+        {
+            log_error("Error: Failed to build program2\n");
+            goto CLEANUP;
+        }
+        verify_kernel2 =
+            clCreateKernel(program_verify2, "checkKernel", &errNum);
+        if (errNum != CL_SUCCESS)
+        {
+            print_error(errNum, "clCreateKernel failed \n");
+            goto CLEANUP;
+        }
+    }
+    for (size_t numBuffersIdx = 0; numBuffersIdx < ARRAY_SIZE(numBuffersList);
+         numBuffersIdx++)
+    {
+        uint32_t numBuffers = numBuffersList[numBuffersIdx];
+        log_info("Number of buffers: %d\n", numBuffers);
+        for (size_t sizeIdx = 0; sizeIdx < ARRAY_SIZE(bufferSizeList);
+             sizeIdx++)
+        {
+            uint32_t bufferSize = bufferSizeList[sizeIdx];
+            uint32_t bufferSizeForOffset = bufferSizeListforOffset[sizeIdx];
+            log_info("&&&& RUNNING vulkan_opencl_buffer test for Buffer size: "
+                     "%d\n",
+                     bufferSize);
+            if (multiImport && !multiCtx)
+            {
+                errNum = run_test_with_multi_import_same_ctx(
+                    context, cmd_queue1, kernel, verify_kernel, vkDevice,
+                    numBuffers, bufferSize, bufferSizeForOffset);
+            }
+            else if (multiImport && multiCtx)
+            {
+                errNum = run_test_with_multi_import_diff_ctx(
+                    context, context2, cmd_queue1, cmd_queue3, kernel, kernel2,
+                    verify_kernel, verify_kernel2, vkDevice, numBuffers,
+                    bufferSize, bufferSizeForOffset);
+            }
+            else if (numCQ == 2)
+            {
+                errNum = run_test_with_two_queue(
+                    context, cmd_queue1, cmd_queue2, kernel, verify_kernel,
+                    vkDevice, numBuffers + 1, bufferSize);
+            }
+            else
+            {
+                errNum = run_test_with_one_queue(context, cmd_queue1, kernel,
+                                                 verify_kernel, vkDevice,
+                                                 numBuffers, bufferSize);
+            }
+            if (errNum != CL_SUCCESS)
+            {
+                print_error(errNum, "func_name failed \n");
+                goto CLEANUP;
+            }
+        }
+    }
+CLEANUP:
+    // Release everything created above; handles never created are still NULL.
+    for (int i = 0; i < 3; i++)
+    {
+        if (kernel[i]) clReleaseKernel(kernel[i]);
+        if (kernel2[i]) clReleaseKernel(kernel2[i]);
+        if (program[i]) clReleaseProgram(program[i]);
+        if (program2[i]) clReleaseProgram(program2[i]);
+    }
+    if (verify_kernel) clReleaseKernel(verify_kernel);
+    if (verify_kernel2) clReleaseKernel(verify_kernel2);
+    if (program_verify) clReleaseProgram(program_verify);
+    if (program_verify2) clReleaseProgram(program_verify2);
+    if (cmd_queue1) clReleaseCommandQueue(cmd_queue1);
+    if (cmd_queue2) clReleaseCommandQueue(cmd_queue2);
+    if (cmd_queue3) clReleaseCommandQueue(cmd_queue3);
+    if (context) clReleaseContext(context);
+    if (context2) clReleaseContext(context2);
+    if (devices) free(devices);
+    if (extensions) free(extensions);
+    return errNum;
+}
diff --git a/test_conformance/vulkan/test_vulkan_interop_image.cpp b/test_conformance/vulkan/test_vulkan_interop_image.cpp
new file mode 100644
index 0000000..7577de0
--- /dev/null
+++ b/test_conformance/vulkan/test_vulkan_interop_image.cpp
@@ -0,0 +1,1596 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#define NOMINMAX
+#include <vulkan_interop_common.hpp>
+#include <string>
+#include "harness/errorHelpers.h"
+// Upper bound on the number of 2D images used in one test iteration; the
+// tests check their image count against it with ASSERT_LEQ.
+#define MAX_2D_IMAGES 5
+// Initial values of the max_width / max_height limits declared below.
+#define MAX_2D_IMAGE_WIDTH 1024
+#define MAX_2D_IMAGE_HEIGHT 1024
+// Number of blockSize-sized blocks needed to hold size elements.
+// NOTE(review): assumes ROUND_UP (defined elsewhere) rounds size up to a
+// multiple of blockSize - confirm.
+#define NUM_BLOCKS(size, blockSize)                                            \
+    (ROUND_UP((size), (blockSize)) / (blockSize))
+// Terminates the process with exit(1) after printing the failed expression,
+// file and line to stderr when x evaluates to false.
+// Wrapped in do { } while (0) so that the macro expands to exactly one
+// statement and stays correct inside an unbraced if/else.
+#define ASSERT(x)                                                              \
+    do                                                                         \
+    {                                                                          \
+        if (!(x))                                                              \
+        {                                                                      \
+            fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x,          \
+                    __FILE__, __LINE__);                                       \
+            exit(1);                                                           \
+        }                                                                      \
+    } while (0)
+// Fails through ASSERT when x is greater than y. Both arguments are
+// parenthesized so that expressions with lower-precedence operators compare
+// as written by the caller.
+#define ASSERT_LEQ(x, y)                                                       \
+    do                                                                         \
+    {                                                                          \
+        if ((x) > (y))                                                         \
+        {                                                                      \
+            ASSERT(0);                                                         \
+        }                                                                      \
+    } while (0)
+namespace {
+// Layout of the parameter buffer the image tests share with the Vulkan side:
+// the buffer is created with sizeof(Params) and filled in before each run.
+struct Params
+{
+    // Number of 2D image descriptors in use (image count times mip levels).
+    uint32_t numImage2DDescriptors;
+};
+} // namespace
+// NOTE(review): presumably receives the OpenCL device UUID for matching
+// against the Vulkan device - it is not referenced in the code visible here.
+static cl_uchar uuid[CL_UUID_SIZE_KHR];
+// OpenCL device passed to the clExternalSemaphore and clExternalMemoryImage
+// constructors.
+static cl_device_id deviceId = NULL;
+// Largest image width/height exercised: larger widthList/heightList entries
+// are skipped, and the staging buffers are sized from these limits.
+size_t max_width = MAX_2D_IMAGE_WIDTH;
+size_t max_height = MAX_2D_IMAGE_HEIGHT;
+// OpenCL C source templates for the image2DKernel variants that operate on
+// one, two and four input/output image pairs. Each work-item inside
+// baseWidth x baseHeight reads the pixel at (x, y) and its vertical mirror at
+// (x, baseHeight - y - 1) and writes each to the other's position in the
+// output image. The %s placeholders are format specifiers to be substituted
+// before compilation - presumably the pixel vector type and the
+// read_image/write_image suffix; confirm against the code that formats them.
+// NOTE(review): smpImg is referenced but not declared in the text shown here.
+const char *kernel_text_numImage_1 = " \
+__kernel void image2DKernel(read_only image2d_t InputImage, write_only image2d_t OutImage, int num2DImages, int baseWidth, int baseHeight, int numMipLevels)\n\
+    int threadIdxX = get_global_id(0);\n\
+    int threadIdxY = get_global_id(1);\n\
+    int numThreadsX = get_global_size(0);                                                                                                  \n\
+    int numThreadsY = get_global_size(1);\n\
+    if (threadIdxX >= baseWidth || threadIdxY >= baseHeight)\n\
+    {\n\
+        return;\n\
+    }\n\
+    %s dataA =  read_image%s(InputImage, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataB =  read_image%s(InputImage, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    write_image%s(OutImage, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataA);\n\
+    write_image%s(OutImage, (int2)( threadIdxX, threadIdxY), dataB);\n\
+const char *kernel_text_numImage_2 = " \
+__kernel void image2DKernel(read_only image2d_t InputImage_1, write_only image2d_t OutImage_1, read_only image2d_t InputImage_2,write_only image2d_t OutImage_2,int num2DImages, int baseWidth, int baseHeight, int numMipLevels)    \n\
+    int threadIdxX = get_global_id(0);\n\
+    int threadIdxY = get_global_id(1);\n\
+    int numThreadsX = get_global_size(0);\n\
+    int numThreadsY = get_global_size(1);\n\
+    if (threadIdxX >= baseWidth || threadIdxY >= baseHeight) \n\
+    {\n\
+        return;\n\
+    }\n\
+    %s dataA =  read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataB =  read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    %s dataC =  read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataD =  read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    write_image%s(OutImage_1, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataA);\n\
+    write_image%s(OutImage_1, (int2)(threadIdxX, threadIdxY), dataB);\n\
+    write_image%s(OutImage_2, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataC);\n\
+    write_image%s(OutImage_2, (int2)(threadIdxX, threadIdxY), dataD);\n\
+const char *kernel_text_numImage_4 = " \
+__kernel void image2DKernel(read_only image2d_t InputImage_1, write_only image2d_t OutImage_1, read_only image2d_t InputImage_2, write_only image2d_t OutImage_2, read_only image2d_t InputImage_3, write_only image2d_t OutImage_3, read_only image2d_t InputImage_4, write_only image2d_t OutImage_4, int num2DImages, int baseWidth, int baseHeight, int numMipLevels)    \n\
+    int threadIdxX = get_global_id(0);\n\
+    int threadIdxY = get_global_id(1);\n\
+    int numThreadsX = get_global_size(0);\n\
+    int numThreadsY = get_global_size(1);\n\
+    if (threadIdxX >= baseWidth || threadIdxY >= baseHeight) \n\
+    {\n\
+        return;\n\
+    }\n\
+    %s dataA =  read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataB =  read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    %s dataC =  read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataD =  read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    %s dataE =  read_image%s(InputImage_3, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataF =  read_image%s(InputImage_3, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    %s dataG =  read_image%s(InputImage_4, smpImg, (int2)(threadIdxX, threadIdxY)); \n\
+    %s dataH =  read_image%s(InputImage_4, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\
+    write_image%s(OutImage_1, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataA);\n\
+    write_image%s(OutImage_1, (int2)(threadIdxX, threadIdxY), dataB);\n\
+    write_image%s(OutImage_2, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataC);\n\
+    write_image%s(OutImage_2, (int2)(threadIdxX, threadIdxY), dataD);\n\
+    write_image%s(OutImage_3, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataE);\n\
+    write_image%s(OutImage_3, (int2)(threadIdxX, threadIdxY), dataF);\n\
+    write_image%s(OutImage_4, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataG);\n\
+    write_image%s(OutImage_4, (int2)(threadIdxX, threadIdxY), dataH);\n\
+const uint32_t num2DImagesList[] = { 1, 2, 4 }; // image counts iterated
+const uint32_t widthList[] = { 4, 64, 183, 1024 }; // image widths iterated
+const uint32_t heightList[] = { 4, 64, 365 }; // image heights iterated
+// Picks the kernel that matches the channel data type of a Vulkan format:
+// kernel_float for *_SFLOAT formats, kernel_unsigned for *_UINT formats and
+// kernel_signed for *_SINT formats.
+//
+// Any other format is reported through log_error and stops the process via
+// ASSERT(0). The result starts out as NULL so that no indeterminate handle
+// can be returned on that path.
+const cl_kernel getKernelType(VulkanFormat format, cl_kernel kernel_float,
+                              cl_kernel kernel_signed,
+                              cl_kernel kernel_unsigned)
+{
+    cl_kernel kernel = NULL;
+    switch (format)
+    {
+        case VULKAN_FORMAT_R32G32B32A32_SFLOAT:
+        case VULKAN_FORMAT_R32G32_SFLOAT:
+        case VULKAN_FORMAT_R32_SFLOAT: kernel = kernel_float; break;
+        case VULKAN_FORMAT_R32G32B32A32_UINT:
+        case VULKAN_FORMAT_R16G16B16A16_UINT:
+        case VULKAN_FORMAT_R8G8B8A8_UINT:
+        case VULKAN_FORMAT_R32G32_UINT:
+        case VULKAN_FORMAT_R16G16_UINT:
+        case VULKAN_FORMAT_R8G8_UINT:
+        case VULKAN_FORMAT_R32_UINT:
+        case VULKAN_FORMAT_R16_UINT:
+        case VULKAN_FORMAT_R8_UINT: kernel = kernel_unsigned; break;
+        case VULKAN_FORMAT_R32G32B32A32_SINT:
+        case VULKAN_FORMAT_R16G16B16A16_SINT:
+        case VULKAN_FORMAT_R8G8B8A8_SINT:
+        case VULKAN_FORMAT_R32G32_SINT:
+        case VULKAN_FORMAT_R16G16_SINT:
+        case VULKAN_FORMAT_R8G8_SINT:
+        case VULKAN_FORMAT_R32_SINT:
+        case VULKAN_FORMAT_R16_SINT:
+        case VULKAN_FORMAT_R8_SINT: kernel = kernel_signed; break;
+        default:
+            log_error(" Unsupported format");
+            ASSERT(0);
+            break;
+    }
+    return kernel;
+}
+int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
+                            cl_command_queue &cmd_queue2,
+                            cl_kernel *kernel_unsigned,
+                            cl_kernel *kernel_signed, cl_kernel *kernel_float,
+                            VulkanDevice &vkDevice)
+    cl_int err = CL_SUCCESS;
+    size_t origin[3] = { 0, 0, 0 };
+    size_t region[3] = { 1, 1, 1 };
+    cl_kernel updateKernelCQ1, updateKernelCQ2;
+    std::vector<VulkanFormat> vkFormatList = getSupportedVulkanFormatList();
+    const std::vector<VulkanExternalMemoryHandleType>
+        vkExternalMemoryHandleTypeList =
+            getSupportedVulkanExternalMemoryHandleTypeList();
+    char magicValue = 0;
+    VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params));
+    VulkanDeviceMemory vkParamsDeviceMemory(
+        vkDevice, vkParamsBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkParamsDeviceMemory.bindBuffer(vkParamsBuffer);
+    uint64_t maxImage2DSize =
+        max_width * max_height * MAX_2D_IMAGE_ELEMENT_SIZE * 2;
+    VulkanBuffer vkSrcBuffer(vkDevice, maxImage2DSize);
+    VulkanDeviceMemory vkSrcBufferDeviceMemory(
+        vkDevice, vkSrcBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkSrcBufferDeviceMemory.bindBuffer(vkSrcBuffer);
+    char *srcBufferPtr, *dstBufferPtr;
+    srcBufferPtr = (char *)malloc(maxImage2DSize);
+    dstBufferPtr = (char *)malloc(maxImage2DSize);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
+    VulkanDescriptorSetLayout vkDescriptorSetLayout(
+        vkDevice, vkDescriptorSetLayoutBindingList);
+    VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
+    VulkanDescriptorPool vkDescriptorPool(vkDevice,
+                                          vkDescriptorSetLayoutBindingList);
+    VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
+                                        vkDescriptorSetLayout);
+    VulkanCommandPool vkCommandPool(vkDevice);
+    VulkanCommandBuffer vkCopyCommandBuffer(vkDevice, vkCommandPool);
+    VulkanCommandBuffer vkShaderCommandBuffer(vkDevice, vkCommandPool);
+    VulkanQueue &vkQueue = vkDevice.getQueue();
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
+    clVk2CLExternalSemaphore = new clExternalSemaphore(
+        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore = new clExternalSemaphore(
+        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory1;
+    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory2;
+    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory1;
+    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory2;
+    std::vector<char> vkImage2DShader;
+    for (size_t fIdx = 0; fIdx < vkFormatList.size(); fIdx++)
+    {
+        VulkanFormat vkFormat = vkFormatList[fIdx];
+        log_info("Format: %d\n", vkFormat);
+        uint32_t elementSize = getVulkanFormatElementSize(vkFormat);
+        ASSERT_LEQ(elementSize, (uint32_t)MAX_2D_IMAGE_ELEMENT_SIZE);
+        log_info("elementSize= %d\n", elementSize);
+        std::string fileName = "image2D_"
+            + std::string(getVulkanFormatGLSLFormat(vkFormat)) + ".spv";
+        log_info("Load %s file", fileName.c_str());
+        vkImage2DShader = readFile(fileName);
+        VulkanShaderModule vkImage2DShaderModule(vkDevice, vkImage2DShader);
+        VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout,
+                                                vkImage2DShaderModule);
+        for (size_t wIdx = 0; wIdx < ARRAY_SIZE(widthList); wIdx++)
+        {
+            uint32_t width = widthList[wIdx];
+            log_info("Width: %d\n", width);
+            if (width > max_width) continue;
+            region[0] = width;
+            for (size_t hIdx = 0; hIdx < ARRAY_SIZE(heightList); hIdx++)
+            {
+                uint32_t height = heightList[hIdx];
+                log_info("Height: %d", height);
+                if (height > max_height) continue;
+                region[1] = height;
+                uint32_t numMipLevels = 1;
+                log_info("Number of mipmap levels: %d\n", numMipLevels);
+                magicValue++;
+                char *vkSrcBufferDeviceMemoryPtr =
+                    (char *);
+                uint64_t srcBufSize = 0;
+                memset(vkSrcBufferDeviceMemoryPtr, 0, maxImage2DSize);
+                memset(srcBufferPtr, 0, maxImage2DSize);
+                uint32_t mipLevel = 0;
+                for (uint32_t row = 0;
+                     row < std::max(height >> mipLevel, uint32_t(1)); row++)
+                {
+                    for (uint32_t col = 0;
+                         col < std::max(width >> mipLevel, uint32_t(1)); col++)
+                    {
+                        for (uint32_t elementByte = 0;
+                             elementByte < elementSize; elementByte++)
+                        {
+                            vkSrcBufferDeviceMemoryPtr[srcBufSize] =
+                                (char)(magicValue + mipLevel + row + col);
+                            srcBufferPtr[srcBufSize] =
+                                (char)(magicValue + mipLevel + row + col);
+                            srcBufSize++;
+                        }
+                    }
+                }
+                srcBufSize = ROUND_UP(
+                    srcBufSize,
+                    std::max(
+                        elementSize,
+                        (uint32_t)VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT));
+                vkSrcBufferDeviceMemory.unmap();
+                for (size_t niIdx = 0; niIdx < ARRAY_SIZE(num2DImagesList);
+                     niIdx++)
+                {
+                    uint32_t num2DImages = num2DImagesList[niIdx] + 1;
+                    // added one image for cross-cq case for updateKernelCQ2
+                    log_info("Number of images: %d\n", num2DImages);
+                    ASSERT_LEQ(num2DImages, (uint32_t)MAX_2D_IMAGES);
+                    uint32_t num_2D_image;
+                    if (useSingleImageKernel)
+                    {
+                        num_2D_image = 1;
+                    }
+                    else
+                    {
+                        num_2D_image = num2DImages;
+                    }
+                    Params *params = (Params *);
+                    params->numImage2DDescriptors = num_2D_image * numMipLevels;
+                    vkParamsDeviceMemory.unmap();
+                    vkDescriptorSet.update(0, vkParamsBuffer);
+                    for (size_t emhtIdx = 0;
+                         emhtIdx < vkExternalMemoryHandleTypeList.size();
+                         emhtIdx++)
+                    {
+                        VulkanExternalMemoryHandleType
+                            vkExternalMemoryHandleType =
+                                vkExternalMemoryHandleTypeList[emhtIdx];
+                        log_info("External memory handle type: %d \n",
+                                 vkExternalMemoryHandleType);
+                        if ((true == disableNTHandleType)
+                            && (VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT
+                                == vkExternalMemoryHandleType))
+                        {
+                            // Skip running for WIN32 NT handle.
+                            continue;
+                        }
+                        VulkanImage2D vkDummyImage2D(
+                            vkDevice, vkFormatList[0], widthList[0],
+                            heightList[0], 1, vkExternalMemoryHandleType);
+                        const VulkanMemoryTypeList &memoryTypeList =
+                            vkDummyImage2D.getMemoryTypeList();
+                        for (size_t mtIdx = 0; mtIdx < memoryTypeList.size();
+                             mtIdx++)
+                        {
+                            const VulkanMemoryType &memoryType =
+                                memoryTypeList[mtIdx];
+                            log_info("Memory type index: %d\n",
+                                     (uint32_t)memoryType);
+                            log_info("Memory type property: %d\n",
+                                     memoryType.getMemoryTypeProperty());
+                            if (!useDeviceLocal)
+                            {
+                                if (VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL
+                                    == memoryType.getMemoryTypeProperty())
+                                {
+                                    continue;
+                                }
+                            }
+                            size_t totalImageMemSize = 0;
+                            uint64_t interImageOffset = 0;
+                            {
+                                VulkanImage2D vkImage2D(
+                                    vkDevice, vkFormat, width, height,
+                                    numMipLevels, vkExternalMemoryHandleType);
+                                ASSERT_LEQ(vkImage2D.getSize(), maxImage2DSize);
+                                totalImageMemSize =
+                                    ROUND_UP(vkImage2D.getSize(),
+                                             vkImage2D.getAlignment());
+                            }
+                            VulkanImage2DList vkNonDedicatedImage2DList(
+                                num2DImages, vkDevice, vkFormat, width, height,
+                                numMipLevels, vkExternalMemoryHandleType);
+                            for (size_t bIdx = 0; bIdx < num2DImages; bIdx++)
+                            {
+                                if (non_dedicated)
+                                {
+                                    vkNonDedicatedImage2DListDeviceMemory1
+                                        .push_back(new VulkanDeviceMemory(
+                                            vkDevice, totalImageMemSize,
+                                            memoryType,
+                                            vkExternalMemoryHandleType));
+                                }
+                                else
+                                {
+                                    vkNonDedicatedImage2DListDeviceMemory1
+                                        .push_back(new VulkanDeviceMemory(
+                                            vkDevice,
+                                            vkNonDedicatedImage2DList[bIdx],
+                                            memoryType,
+                                            vkExternalMemoryHandleType));
+                                }
+                                vkNonDedicatedImage2DListDeviceMemory1[bIdx]
+                                    ->bindImage(vkNonDedicatedImage2DList[bIdx],
+                                                0);
+                                nonDedicatedExternalMemory1.push_back(
+                                    new clExternalMemoryImage(
+                                        *vkNonDedicatedImage2DListDeviceMemory1
+                                            [bIdx],
+                                        vkExternalMemoryHandleType, context,
+                                        totalImageMemSize, width, height, 0,
+                                        vkNonDedicatedImage2DList[bIdx],
+                                        deviceId));
+                            }
+                            VulkanImageViewList vkNonDedicatedImage2DViewList(
+                                vkDevice, vkNonDedicatedImage2DList);
+                            VulkanImage2DList vkNonDedicatedImage2DList2(
+                                num2DImages, vkDevice, vkFormat, width, height,
+                                numMipLevels, vkExternalMemoryHandleType);
+                            for (size_t bIdx = 0; bIdx < num2DImages; bIdx++)
+                            {
+                                if (non_dedicated)
+                                {
+                                    vkNonDedicatedImage2DListDeviceMemory2
+                                        .push_back(new VulkanDeviceMemory(
+                                            vkDevice, totalImageMemSize,
+                                            memoryType,
+                                            vkExternalMemoryHandleType));
+                                }
+                                else
+                                {
+                                    vkNonDedicatedImage2DListDeviceMemory2
+                                        .push_back(new VulkanDeviceMemory(
+                                            vkDevice,
+                                            vkNonDedicatedImage2DList2[bIdx],
+                                            memoryType,
+                                            vkExternalMemoryHandleType));
+                                }
+                                vkNonDedicatedImage2DListDeviceMemory2[bIdx]
+                                    ->bindImage(
+                                        vkNonDedicatedImage2DList2[bIdx], 0);
+                                nonDedicatedExternalMemory2.push_back(
+                                    new clExternalMemoryImage(
+                                        *vkNonDedicatedImage2DListDeviceMemory2
+                                            [bIdx],
+                                        vkExternalMemoryHandleType, context,
+                                        totalImageMemSize, width, height, 0,
+                                        vkNonDedicatedImage2DList2[bIdx],
+                                        deviceId));
+                            }
+                            VulkanImageViewList vkDedicatedImage2DViewList(
+                                vkDevice, vkNonDedicatedImage2DList2);
+                            cl_mem external_mem_image1[5];
+                            cl_mem external_mem_image2[5];
+                            for (int i = 0; i < num2DImages; i++)
+                            {
+                                external_mem_image1[i] =
+                                    nonDedicatedExternalMemory1[i]
+                                        ->getExternalMemoryImage();
+                                external_mem_image2[i] =
+                                    nonDedicatedExternalMemory2[i]
+                                        ->getExternalMemoryImage();
+                            }
+                            VulkanImage2DList &vkImage2DList =
+                                vkNonDedicatedImage2DList;
+                            VulkanImageViewList &vkImage2DViewList =
+                                vkNonDedicatedImage2DViewList;
+                            clCl2VkExternalSemaphore->signal(cmd_queue1);
+                            if (!useSingleImageKernel)
+                            {
+                                for (size_t i2DIdx = 0;
+                                     i2DIdx < vkImage2DList.size(); i2DIdx++)
+                                {
+                                    for (uint32_t mipLevel = 0;
+                                         mipLevel < numMipLevels; mipLevel++)
+                                    {
+                                        uint32_t i2DvIdx =
+                                            (uint32_t)(i2DIdx * numMipLevels)
+                                            + mipLevel;
+                                        vkDescriptorSet.update(
+                                            1 + i2DvIdx,
+                                            vkImage2DViewList[i2DvIdx]);
+                                    }
+                                }
+                                vkCopyCommandBuffer.begin();
+                                vkCopyCommandBuffer.pipelineBarrier(
+                                    vkImage2DList,
+                                    VULKAN_IMAGE_LAYOUT_UNDEFINED,
+                                    VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                for (size_t i2DIdx = 0;
+                                     i2DIdx < vkImage2DList.size(); i2DIdx++)
+                                {
+                                    vkCopyCommandBuffer.copyBufferToImage(
+                                        vkSrcBuffer, vkImage2DList[i2DIdx],
+                                        VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                }
+                                vkCopyCommandBuffer.pipelineBarrier(
+                                    vkImage2DList,
+                                    VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                                    VULKAN_IMAGE_LAYOUT_GENERAL);
+                                vkCopyCommandBuffer.end();
+                                memset(dstBufferPtr, 0, srcBufSize);
+                                vkQueue.submit(vkCopyCommandBuffer);
+                                vkShaderCommandBuffer.begin();
+                                vkShaderCommandBuffer.bindPipeline(
+                                    vkComputePipeline);
+                                vkShaderCommandBuffer.bindDescriptorSets(
+                                    vkComputePipeline, vkPipelineLayout,
+                                    vkDescriptorSet);
+                                vkShaderCommandBuffer.dispatch(
+                                    NUM_BLOCKS(width, NUM_THREADS_PER_GROUP_X),
+                                    NUM_BLOCKS(height,
+                                               NUM_THREADS_PER_GROUP_Y / 2),
+                                    1);
+                                vkShaderCommandBuffer.end();
+                            }
+                            for (uint32_t iter = 0; iter < innerIterations;
+                                 iter++)
+                            {
+                                if (useSingleImageKernel)
+                                {
+                                    for (size_t i2DIdx = 0;
+                                         i2DIdx < vkImage2DList.size();
+                                         i2DIdx++)
+                                    {
+                                        vkDescriptorSet.update(
+                                            1, vkImage2DViewList[i2DIdx]);
+                                        vkCopyCommandBuffer.begin();
+                                        vkCopyCommandBuffer.pipelineBarrier(
+                                            vkImage2DList,
+                                            VULKAN_IMAGE_LAYOUT_UNDEFINED,
+                                            VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                        vkCopyCommandBuffer.copyBufferToImage(
+                                            vkSrcBuffer, vkImage2DList[i2DIdx],
+                                            VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                        vkCopyCommandBuffer.pipelineBarrier(
+                                            vkImage2DList,
+                                            VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                                            VULKAN_IMAGE_LAYOUT_GENERAL);
+                                        vkCopyCommandBuffer.end();
+                                        memset(dstBufferPtr, 0, srcBufSize);
+                                        vkQueue.submit(vkCopyCommandBuffer);
+                                        vkShaderCommandBuffer.begin();
+                                        vkShaderCommandBuffer.bindPipeline(
+                                            vkComputePipeline);
+                                        vkShaderCommandBuffer
+                                            .bindDescriptorSets(
+                                                vkComputePipeline,
+                                                vkPipelineLayout,
+                                                vkDescriptorSet);
+                                        vkShaderCommandBuffer.dispatch(
+                                            NUM_BLOCKS(width,
+                                                       NUM_THREADS_PER_GROUP_X),
+                                            NUM_BLOCKS(height,
+                                                       NUM_THREADS_PER_GROUP_Y
+                                                           / 2),
+                                            1);
+                                        vkShaderCommandBuffer.end();
+                                        if (i2DIdx < vkImage2DList.size() - 1)
+                                        {
+                                            vkQueue.submit(
+                                                vkShaderCommandBuffer);
+                                        }
+                                    }
+                                }
+                                vkQueue.submit(vkCl2VkSemaphore,
+                                               vkShaderCommandBuffer,
+                                               vkVk2CLSemaphore);
+                                clVk2CLExternalSemaphore->wait(cmd_queue1);
+                                switch (num2DImages)
+                                {
+                                    case 2:
+                                        updateKernelCQ1 = getKernelType(
+                                            vkFormat, kernel_float[0],
+                                            kernel_signed[0],
+                                            kernel_unsigned[0]);
+                                        break;
+                                    case 3:
+                                        updateKernelCQ1 = getKernelType(
+                                            vkFormat, kernel_float[1],
+                                            kernel_signed[1],
+                                            kernel_unsigned[1]);
+                                        break;
+                                    case 5:
+                                        updateKernelCQ1 = getKernelType(
+                                            vkFormat, kernel_float[2],
+                                            kernel_signed[2],
+                                            kernel_unsigned[2]);
+                                        break;
+                                }
+                                updateKernelCQ2 = getKernelType(
+                                    vkFormat, kernel_float[3], kernel_signed[3],
+                                    kernel_unsigned[3]);
+                                // similar kernel-type based on vkFormat
+                                int j = 0;
+                                // Setting arguments of updateKernelCQ2
+                                err = clSetKernelArg(updateKernelCQ2, 0,
+                                                     sizeof(cl_mem),
+                                                     &external_mem_image1[0]);
+                                err |= clSetKernelArg(updateKernelCQ2, 1,
+                                                      sizeof(cl_mem),
+                                                      &external_mem_image2[0]);
+                                err |= clSetKernelArg(
+                                    updateKernelCQ2, 2, sizeof(cl_mem),
+                                    &external_mem_image1[num2DImages - 1]);
+                                err |= clSetKernelArg(
+                                    updateKernelCQ2, 3, sizeof(cl_mem),
+                                    &external_mem_image2[num2DImages - 1]);
+                                err |= clSetKernelArg(updateKernelCQ2, 4,
+                                                      sizeof(unsigned int),
+                                                      &num2DImages);
+                                err |= clSetKernelArg(updateKernelCQ2, 5,
+                                                      sizeof(unsigned int),
+                                                      &width);
+                                err |= clSetKernelArg(updateKernelCQ2, 6,
+                                                      sizeof(unsigned int),
+                                                      &height);
+                                err |= clSetKernelArg(updateKernelCQ2, 7,
+                                                      sizeof(unsigned int),
+                                                      &numMipLevels);
+                                for (int i = 0; i < num2DImages - 1; i++, ++j)
+                                {
+                                    err = clSetKernelArg(
+                                        updateKernelCQ1, j, sizeof(cl_mem),
+                                        &external_mem_image1[i]);
+                                    err |= clSetKernelArg(
+                                        updateKernelCQ1, ++j, sizeof(cl_mem),
+                                        &external_mem_image2[i]);
+                                }
+                                err |= clSetKernelArg(updateKernelCQ1, j,
+                                                      sizeof(unsigned int),
+                                                      &num2DImages);
+                                err |= clSetKernelArg(updateKernelCQ1, ++j,
+                                                      sizeof(unsigned int),
+                                                      &width);
+                                err |= clSetKernelArg(updateKernelCQ1, ++j,
+                                                      sizeof(unsigned int),
+                                                      &height);
+                                err |= clSetKernelArg(updateKernelCQ1, ++j,
+                                                      sizeof(unsigned int),
+                                                      &numMipLevels);
+                                if (err != CL_SUCCESS)
+                                {
+                                    print_error(
+                                        err,
+                                        "Error: Failed to set arg values \n");
+                                    goto CLEANUP;
+                                }
+                                // clVk2CLExternalSemaphore->wait(cmd_queue1);
+                                size_t global_work_size[3] = { width, height,
+                                                               1 };
+                                cl_event first_launch;
+                                err = clEnqueueNDRangeKernel(
+                                    cmd_queue1, updateKernelCQ1, 2, NULL,
+                                    global_work_size, NULL, 0, NULL,
+                                    &first_launch);
+                                if (err != CL_SUCCESS)
+                                {
+                                    goto CLEANUP;
+                                }
+                                err = clEnqueueNDRangeKernel(
+                                    cmd_queue2, updateKernelCQ2, 2, NULL,
+                                    global_work_size, NULL, 1, &first_launch,
+                                    NULL);
+                                if (err != CL_SUCCESS)
+                                {
+                                    goto CLEANUP;
+                                }
+                                clFinish(cmd_queue2);
+                                clCl2VkExternalSemaphore->signal(cmd_queue2);
+                            }
+                            unsigned int flags = 0;
+                            size_t mipmapLevelOffset = 0;
+                            cl_event eventReadImage = NULL;
+                            clFinish(cmd_queue2);
+                            for (int i = 0; i < num2DImages; i++)
+                            {
+                                err = clEnqueueReadImage(
+                                    cmd_queue1, external_mem_image2[i], CL_TRUE,
+                                    origin, region, 0, 0, dstBufferPtr, 0, NULL,
+                                    &eventReadImage);
+                                if (err != CL_SUCCESS)
+                                {
+                                    print_error(err,
+                                                "clEnqueueReadImage failed with"
+                                                "error\n");
+                                }
+                                if (memcmp(srcBufferPtr, dstBufferPtr,
+                                           srcBufSize))
+                                {
+                                    log_info("Source and destination buffers "
+                                             "don't match\n");
+                                    if (debug_trace)
+                                    {
+                                        log_info("Source buffer contents: \n");
+                                        for (uint64_t sIdx = 0;
+                                             sIdx < srcBufSize; sIdx++)
+                                        {
+                                            log_info(
+                                                "%d ",
+                                                (int)vkSrcBufferDeviceMemoryPtr
+                                                    [sIdx]);
+                                        }
+                                        log_info("Destination buffer contents:"
+                                                 "\n");
+                                        for (uint64_t dIdx = 0;
+                                             dIdx < srcBufSize; dIdx++)
+                                        {
+                                            log_info("%d ",
+                                                     (int)dstBufferPtr[dIdx]);
+                                        }
+                                    }
+                                    err = -1;
+                                    break;
+                                }
+                            }
+                            for (int i = 0; i < num2DImages; i++)
+                            {
+                                delete vkNonDedicatedImage2DListDeviceMemory1
+                                    [i];
+                                delete vkNonDedicatedImage2DListDeviceMemory2
+                                    [i];
+                                delete nonDedicatedExternalMemory1[i];
+                                delete nonDedicatedExternalMemory2[i];
+                            }
+                            vkNonDedicatedImage2DListDeviceMemory1.erase(
+                                vkNonDedicatedImage2DListDeviceMemory1.begin(),
+                                vkNonDedicatedImage2DListDeviceMemory1.begin()
+                                    + num2DImages);
+                            vkNonDedicatedImage2DListDeviceMemory2.erase(
+                                vkNonDedicatedImage2DListDeviceMemory2.begin(),
+                                vkNonDedicatedImage2DListDeviceMemory2.begin()
+                                    + num2DImages);
+                            nonDedicatedExternalMemory1.erase(
+                                nonDedicatedExternalMemory1.begin(),
+                                nonDedicatedExternalMemory1.begin()
+                                    + num2DImages);
+                            nonDedicatedExternalMemory2.erase(
+                                nonDedicatedExternalMemory2.begin(),
+                                nonDedicatedExternalMemory2.begin()
+                                    + num2DImages);
+                            if (CL_SUCCESS != err)
+                            {
+                                goto CLEANUP;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        vkImage2DShader.clear();
+    }
+    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (srcBufferPtr) free(srcBufferPtr);
+    if (dstBufferPtr) free(dstBufferPtr);
+    return err;
+int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
+                            cl_kernel *kernel_unsigned,
+                            cl_kernel *kernel_signed, cl_kernel *kernel_float,
+                            VulkanDevice &vkDevice)
+    cl_int err = CL_SUCCESS;
+    size_t origin[3] = { 0, 0, 0 };
+    size_t region[3] = { 1, 1, 1 };
+    cl_kernel updateKernelCQ1;
+    std::vector<VulkanFormat> vkFormatList = getSupportedVulkanFormatList();
+    const std::vector<VulkanExternalMemoryHandleType>
+        vkExternalMemoryHandleTypeList =
+            getSupportedVulkanExternalMemoryHandleTypeList();
+    char magicValue = 0;
+    VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params));
+    VulkanDeviceMemory vkParamsDeviceMemory(
+        vkDevice, vkParamsBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkParamsDeviceMemory.bindBuffer(vkParamsBuffer);
+    uint64_t maxImage2DSize =
+        max_width * max_height * MAX_2D_IMAGE_ELEMENT_SIZE * 2;
+    VulkanBuffer vkSrcBuffer(vkDevice, maxImage2DSize);
+    VulkanDeviceMemory vkSrcBufferDeviceMemory(
+        vkDevice, vkSrcBuffer.getSize(),
+        getVulkanMemoryType(vkDevice,
+    vkSrcBufferDeviceMemory.bindBuffer(vkSrcBuffer);
+    char *srcBufferPtr, *dstBufferPtr;
+    srcBufferPtr = (char *)malloc(maxImage2DSize);
+    dstBufferPtr = (char *)malloc(maxImage2DSize);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
+    VulkanDescriptorSetLayout vkDescriptorSetLayout(
+        vkDevice, vkDescriptorSetLayoutBindingList);
+    VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
+    VulkanDescriptorPool vkDescriptorPool(vkDevice,
+                                          vkDescriptorSetLayoutBindingList);
+    VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
+                                        vkDescriptorSetLayout);
+    VulkanCommandPool vkCommandPool(vkDevice);
+    VulkanCommandBuffer vkCopyCommandBuffer(vkDevice, vkCommandPool);
+    VulkanCommandBuffer vkShaderCommandBuffer(vkDevice, vkCommandPool);
+    VulkanQueue &vkQueue = vkDevice.getQueue();
+    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
+        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
+    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
+    clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
+    clVk2CLExternalSemaphore = new clExternalSemaphore(
+        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    clCl2VkExternalSemaphore = new clExternalSemaphore(
+        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory1;
+    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory2;
+    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory1;
+    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory2;
+    std::vector<char> vkImage2DShader;
+    for (size_t fIdx = 0; fIdx < vkFormatList.size(); fIdx++)
+    {
+        VulkanFormat vkFormat = vkFormatList[fIdx];
+        log_info("Format: %d\n", vkFormat);
+        uint32_t elementSize = getVulkanFormatElementSize(vkFormat);
+        ASSERT_LEQ(elementSize, (uint32_t)MAX_2D_IMAGE_ELEMENT_SIZE);
+        log_info("elementSize= %d\n", elementSize);
+        std::string fileName = "image2D_"
+            + std::string(getVulkanFormatGLSLFormat(vkFormat)) + ".spv";
+        log_info("Load %s file", fileName.c_str());
+        vkImage2DShader = readFile(fileName);
+        VulkanShaderModule vkImage2DShaderModule(vkDevice, vkImage2DShader);
+        VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout,
+                                                vkImage2DShaderModule);
+        for (size_t wIdx = 0; wIdx < ARRAY_SIZE(widthList); wIdx++)
+        {
+            uint32_t width = widthList[wIdx];
+            log_info("Width: %d\n", width);
+            if (width > max_width) continue;
+            region[0] = width;
+            for (size_t hIdx = 0; hIdx < ARRAY_SIZE(heightList); hIdx++)
+            {
+                uint32_t height = heightList[hIdx];
+                log_info("Height: %d\n", height);
+                if (height > max_height) continue;
+                region[1] = height;
+                uint32_t numMipLevels = 1;
+                log_info("Number of mipmap levels: %d\n", numMipLevels);
+                magicValue++;
+                char *vkSrcBufferDeviceMemoryPtr =
+                    (char *);
+                uint64_t srcBufSize = 0;
+                memset(vkSrcBufferDeviceMemoryPtr, 0, maxImage2DSize);
+                memset(srcBufferPtr, 0, maxImage2DSize);
+                uint32_t mipLevel = 0;
+                for (uint32_t row = 0;
+                     row < std::max(height >> mipLevel, uint32_t(1)); row++)
+                {
+                    for (uint32_t col = 0;
+                         col < std::max(width >> mipLevel, uint32_t(1)); col++)
+                    {
+                        for (uint32_t elementByte = 0;
+                             elementByte < elementSize; elementByte++)
+                        {
+                            vkSrcBufferDeviceMemoryPtr[srcBufSize] =
+                                (char)(magicValue + mipLevel + row + col);
+                            srcBufferPtr[srcBufSize] =
+                                (char)(magicValue + mipLevel + row + col);
+                            srcBufSize++;
+                        }
+                    }
+                }
+                srcBufSize = ROUND_UP(
+                    srcBufSize,
+                    std::max(
+                        elementSize,
+                        (uint32_t)VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT));
+                vkSrcBufferDeviceMemory.unmap();
+                for (size_t niIdx = 0; niIdx < ARRAY_SIZE(num2DImagesList);
+                     niIdx++)
+                {
+                    uint32_t num2DImages = num2DImagesList[niIdx];
+                    log_info("Number of images: %d\n", num2DImages);
+                    ASSERT_LEQ(num2DImages, (uint32_t)MAX_2D_IMAGES);
+                    Params *params = (Params *);
+                    uint32_t num_2D_image;
+                    if (useSingleImageKernel)
+                    {
+                        num_2D_image = 1;
+                    }
+                    else
+                    {
+                        num_2D_image = num2DImages;
+                    }
+                    params->numImage2DDescriptors = num_2D_image * numMipLevels;
+                    vkParamsDeviceMemory.unmap();
+                    vkDescriptorSet.update(0, vkParamsBuffer);
+                    for (size_t emhtIdx = 0;
+                         emhtIdx < vkExternalMemoryHandleTypeList.size();
+                         emhtIdx++)
+                    {
+                        VulkanExternalMemoryHandleType
+                            vkExternalMemoryHandleType =
+                                vkExternalMemoryHandleTypeList[emhtIdx];
+                        log_info("External memory handle type: %d \n",
+                                 vkExternalMemoryHandleType);
+                        if ((true == disableNTHandleType)
+                            && (VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT
+                                == vkExternalMemoryHandleType))
+                        {
+                            // Skip running for WIN32 NT handle.
+                            continue;
+                        }
+                        VulkanImage2D vkDummyImage2D(
+                            vkDevice, vkFormatList[0], widthList[0],
+                            heightList[0], 1, vkExternalMemoryHandleType);
+                        const VulkanMemoryTypeList &memoryTypeList =
+                            vkDummyImage2D.getMemoryTypeList();
+                        for (size_t mtIdx = 0; mtIdx < memoryTypeList.size();
+                             mtIdx++)
+                        {
+                            const VulkanMemoryType &memoryType =
+                                memoryTypeList[mtIdx];
+                            log_info("Memory type index: %d\n",
+                                     (uint32_t)memoryType);
+                            log_info("Memory type property: %d\n",
+                                     memoryType.getMemoryTypeProperty());
+                            if (!useDeviceLocal)
+                            {
+                                if (VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL
+                                    == memoryType.getMemoryTypeProperty())
+                                {
+                                    continue;
+                                }
+                            }
+                            size_t totalImageMemSize = 0;
+                            uint64_t interImageOffset = 0;
+                            {
+                                VulkanImage2D vkImage2D(
+                                    vkDevice, vkFormat, width, height,
+                                    numMipLevels, vkExternalMemoryHandleType);
+                                ASSERT_LEQ(vkImage2D.getSize(), maxImage2DSize);
+                                totalImageMemSize =
+                                    ROUND_UP(vkImage2D.getSize(),
+                                             vkImage2D.getAlignment());
+                            }
+                            VulkanImage2DList vkNonDedicatedImage2DList(
+                                num2DImages, vkDevice, vkFormat, width, height,
+                                numMipLevels, vkExternalMemoryHandleType);
+                            for (size_t bIdx = 0;
+                                 bIdx < vkNonDedicatedImage2DList.size();
+                                 bIdx++)
+                            {
+                                // Create list of Vulkan device memories and
+                                // bind the list of Vulkan images.
+                                vkNonDedicatedImage2DListDeviceMemory1
+                                    .push_back(new VulkanDeviceMemory(
+                                        vkDevice, totalImageMemSize, memoryType,
+                                        vkExternalMemoryHandleType));
+                                vkNonDedicatedImage2DListDeviceMemory1[bIdx]
+                                    ->bindImage(vkNonDedicatedImage2DList[bIdx],
+                                                0);
+                                nonDedicatedExternalMemory1.push_back(
+                                    new clExternalMemoryImage(
+                                        *vkNonDedicatedImage2DListDeviceMemory1
+                                            [bIdx],
+                                        vkExternalMemoryHandleType, context,
+                                        totalImageMemSize, width, height, 0,
+                                        vkNonDedicatedImage2DList[bIdx],
+                                        deviceId));
+                            }
+                            VulkanImageViewList vkNonDedicatedImage2DViewList(
+                                vkDevice, vkNonDedicatedImage2DList);
+                            VulkanImage2DList vkNonDedicatedImage2DList2(
+                                num2DImages, vkDevice, vkFormat, width, height,
+                                numMipLevels, vkExternalMemoryHandleType);
+                            for (size_t bIdx = 0;
+                                 bIdx < vkNonDedicatedImage2DList2.size();
+                                 bIdx++)
+                            {
+                                vkNonDedicatedImage2DListDeviceMemory2
+                                    .push_back(new VulkanDeviceMemory(
+                                        vkDevice, totalImageMemSize, memoryType,
+                                        vkExternalMemoryHandleType));
+                                vkNonDedicatedImage2DListDeviceMemory2[bIdx]
+                                    ->bindImage(
+                                        vkNonDedicatedImage2DList2[bIdx], 0);
+                                nonDedicatedExternalMemory2.push_back(
+                                    new clExternalMemoryImage(
+                                        *vkNonDedicatedImage2DListDeviceMemory2
+                                            [bIdx],
+                                        vkExternalMemoryHandleType, context,
+                                        totalImageMemSize, width, height, 0,
+                                        vkNonDedicatedImage2DList2[bIdx],
+                                        deviceId));
+                            }
+                            VulkanImageViewList vkDedicatedImage2DViewList(
+                                vkDevice, vkNonDedicatedImage2DList2);
+                            cl_mem external_mem_image1[4];
+                            cl_mem external_mem_image2[4];
+                            for (int i = 0; i < num2DImages; i++)
+                            {
+                                external_mem_image1[i] =
+                                    nonDedicatedExternalMemory1[i]
+                                        ->getExternalMemoryImage();
+                                external_mem_image2[i] =
+                                    nonDedicatedExternalMemory2[i]
+                                        ->getExternalMemoryImage();
+                            }
+                            VulkanImage2DList &vkImage2DList =
+                                vkNonDedicatedImage2DList;
+                            VulkanImageViewList &vkImage2DViewList =
+                                vkNonDedicatedImage2DViewList;
+                            clCl2VkExternalSemaphore->signal(cmd_queue1);
+                            if (!useSingleImageKernel)
+                            {
+                                for (size_t i2DIdx = 0;
+                                     i2DIdx < vkImage2DList.size(); i2DIdx++)
+                                {
+                                    for (uint32_t mipLevel = 0;
+                                         mipLevel < numMipLevels; mipLevel++)
+                                    {
+                                        uint32_t i2DvIdx =
+                                            (uint32_t)(i2DIdx * numMipLevels)
+                                            + mipLevel;
+                                        vkDescriptorSet.update(
+                                            1 + i2DvIdx,
+                                            vkImage2DViewList[i2DvIdx]);
+                                    }
+                                }
+                                vkCopyCommandBuffer.begin();
+                                vkCopyCommandBuffer.pipelineBarrier(
+                                    vkImage2DList,
+                                    VULKAN_IMAGE_LAYOUT_UNDEFINED,
+                                    VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                for (size_t i2DIdx = 0;
+                                     i2DIdx < vkImage2DList.size(); i2DIdx++)
+                                {
+                                    vkCopyCommandBuffer.copyBufferToImage(
+                                        vkSrcBuffer, vkImage2DList[i2DIdx],
+                                        VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                }
+                                vkCopyCommandBuffer.pipelineBarrier(
+                                    vkImage2DList,
+                                    VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                                    VULKAN_IMAGE_LAYOUT_GENERAL);
+                                vkCopyCommandBuffer.end();
+                                memset(dstBufferPtr, 0, srcBufSize);
+                                vkQueue.submit(vkCopyCommandBuffer);
+                                vkShaderCommandBuffer.begin();
+                                vkShaderCommandBuffer.bindPipeline(
+                                    vkComputePipeline);
+                                vkShaderCommandBuffer.bindDescriptorSets(
+                                    vkComputePipeline, vkPipelineLayout,
+                                    vkDescriptorSet);
+                                vkShaderCommandBuffer.dispatch(
+                                    NUM_BLOCKS(width, NUM_THREADS_PER_GROUP_X),
+                                    NUM_BLOCKS(height,
+                                               NUM_THREADS_PER_GROUP_Y / 2),
+                                    1);
+                                vkShaderCommandBuffer.end();
+                            }
+                            for (uint32_t iter = 0; iter < innerIterations;
+                                 iter++)
+                            {
+                                if (useSingleImageKernel)
+                                {
+                                    for (size_t i2DIdx = 0;
+                                         i2DIdx < vkImage2DList.size();
+                                         i2DIdx++)
+                                    {
+                                        vkDescriptorSet.update(
+                                            1, vkImage2DViewList[i2DIdx]);
+                                        vkCopyCommandBuffer.begin();
+                                        vkCopyCommandBuffer.pipelineBarrier(
+                                            vkImage2DList,
+                                            VULKAN_IMAGE_LAYOUT_UNDEFINED,
+                                            VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                        vkCopyCommandBuffer.copyBufferToImage(
+                                            vkSrcBuffer, vkImage2DList[i2DIdx],
+                                            VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+                                        vkCopyCommandBuffer.pipelineBarrier(
+                                            vkImage2DList,
+                                            VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                                            VULKAN_IMAGE_LAYOUT_GENERAL);
+                                        vkCopyCommandBuffer.end();
+                                        memset(dstBufferPtr, 0, srcBufSize);
+                                        vkQueue.submit(vkCopyCommandBuffer);
+                                        vkShaderCommandBuffer.begin();
+                                        vkShaderCommandBuffer.bindPipeline(
+                                            vkComputePipeline);
+                                        vkShaderCommandBuffer
+                                            .bindDescriptorSets(
+                                                vkComputePipeline,
+                                                vkPipelineLayout,
+                                                vkDescriptorSet);
+                                        vkShaderCommandBuffer.dispatch(
+                                            NUM_BLOCKS(width,
+                                                       NUM_THREADS_PER_GROUP_X),
+                                            NUM_BLOCKS(height,
+                                                       NUM_THREADS_PER_GROUP_Y
+                                                           / 2),
+                                            1);
+                                        vkShaderCommandBuffer.end();
+                                        if (i2DIdx < vkImage2DList.size() - 1)
+                                        {
+                                            vkQueue.submit(
+                                                vkShaderCommandBuffer);
+                                        }
+                                    }
+                                }
+                                vkQueue.submit(vkCl2VkSemaphore,
+                                               vkShaderCommandBuffer,
+                                               vkVk2CLSemaphore);
+                                clVk2CLExternalSemaphore->wait(cmd_queue1);
+                                switch (num2DImages)
+                                {
+                                    case 1:
+                                        updateKernelCQ1 = getKernelType(
+                                            vkFormat, kernel_float[0],
+                                            kernel_signed[0],
+                                            kernel_unsigned[0]);
+                                        break;
+                                    case 2:
+                                        updateKernelCQ1 = getKernelType(
+                                            vkFormat, kernel_float[1],
+                                            kernel_signed[1],
+                                            kernel_unsigned[1]);
+                                        break;
+                                    case 4:
+                                        updateKernelCQ1 = getKernelType(
+                                            vkFormat, kernel_float[2],
+                                            kernel_signed[2],
+                                            kernel_unsigned[2]);
+                                        break;
+                                }
+                                int j = 0;
+                                for (int i = 0; i < num2DImages; i++, ++j)
+                                {
+                                    err = clSetKernelArg(
+                                        updateKernelCQ1, j, sizeof(cl_mem),
+                                        &external_mem_image1[i]);
+                                    err |= clSetKernelArg(
+                                        updateKernelCQ1, ++j, sizeof(cl_mem),
+                                        &external_mem_image2[i]);
+                                }
+                                err |= clSetKernelArg(updateKernelCQ1, j,
+                                                      sizeof(unsigned int),
+                                                      &num2DImages);
+                                err |= clSetKernelArg(updateKernelCQ1, ++j,
+                                                      sizeof(unsigned int),
+                                                      &width);
+                                err |= clSetKernelArg(updateKernelCQ1, ++j,
+                                                      sizeof(unsigned int),
+                                                      &height);
+                                err |= clSetKernelArg(updateKernelCQ1, ++j,
+                                                      sizeof(unsigned int),
+                                                      &numMipLevels);
+                                if (err != CL_SUCCESS)
+                                {
+                                    print_error(err,
+                                                "Error: Failed to set arg "
+                                                "values for kernel-1\n");
+                                    goto CLEANUP;
+                                }
+                                size_t global_work_size[3] = { width, height,
+                                                               1 };
+                                err = clEnqueueNDRangeKernel(
+                                    cmd_queue1, updateKernelCQ1, 2, NULL,
+                                    global_work_size, NULL, 0, NULL, NULL);
+                                if (err != CL_SUCCESS)
+                                {
+                                    goto CLEANUP;
+                                }
+                                clCl2VkExternalSemaphore->signal(cmd_queue1);
+                            }
+                            unsigned int flags = 0;
+                            size_t mipmapLevelOffset = 0;
+                            cl_event eventReadImage = NULL;
+                            for (int i = 0; i < num2DImages; i++)
+                            {
+                                err = clEnqueueReadImage(
+                                    cmd_queue1, external_mem_image2[i], CL_TRUE,
+                                    origin, region, 0, 0, dstBufferPtr, 0, NULL,
+                                    &eventReadImage);
+                                if (err != CL_SUCCESS)
+                                {
+                                    print_error(err,
+                                                "clEnqueueReadImage failed with"
+                                                "error\n");
+                                }
+                                if (memcmp(srcBufferPtr, dstBufferPtr,
+                                           srcBufSize))
+                                {
+                                    log_info("Source and destination buffers "
+                                             "don't match\n");
+                                    if (debug_trace)
+                                    {
+                                        log_info("Source buffer contents: \n");
+                                        for (uint64_t sIdx = 0;
+                                             sIdx < srcBufSize; sIdx++)
+                                        {
+                                            log_info(
+                                                "%d",
+                                                (int)vkSrcBufferDeviceMemoryPtr
+                                                    [sIdx]);
+                                        }
+                                        log_info(
+                                            "Destination buffer contents:");
+                                        for (uint64_t dIdx = 0;
+                                             dIdx < srcBufSize; dIdx++)
+                                        {
+                                            log_info("%d",
+                                                     (int)dstBufferPtr[dIdx]);
+                                        }
+                                    }
+                                    err = -1;
+                                    break;
+                                }
+                            }
+                            for (int i = 0; i < num2DImages; i++)
+                            {
+                                delete vkNonDedicatedImage2DListDeviceMemory1
+                                    [i];
+                                delete vkNonDedicatedImage2DListDeviceMemory2
+                                    [i];
+                                delete nonDedicatedExternalMemory1[i];
+                                delete nonDedicatedExternalMemory2[i];
+                            }
+                            vkNonDedicatedImage2DListDeviceMemory1.erase(
+                                vkNonDedicatedImage2DListDeviceMemory1.begin(),
+                                vkNonDedicatedImage2DListDeviceMemory1.begin()
+                                    + num2DImages);
+                            vkNonDedicatedImage2DListDeviceMemory2.erase(
+                                vkNonDedicatedImage2DListDeviceMemory2.begin(),
+                                vkNonDedicatedImage2DListDeviceMemory2.begin()
+                                    + num2DImages);
+                            nonDedicatedExternalMemory1.erase(
+                                nonDedicatedExternalMemory1.begin(),
+                                nonDedicatedExternalMemory1.begin()
+                                    + num2DImages);
+                            nonDedicatedExternalMemory2.erase(
+                                nonDedicatedExternalMemory2.begin(),
+                                nonDedicatedExternalMemory2.begin()
+                                    + num2DImages);
+                            if (CL_SUCCESS != err)
+                            {
+                                goto CLEANUP;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        vkImage2DShader.clear();
+    }
+    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (srcBufferPtr) free(srcBufferPtr);
+    if (dstBufferPtr) free(dstBufferPtr);
+    return err;
+// Sets up the OpenCL side of the Vulkan image interop test and runs it.
+//
+// Steps performed:
+//   1. Enumerate the GPU devices of the first platform and pick the one whose
+//      CL_DEVICE_UUID_KHR matches the UUID of the Vulkan physical device.
+//   2. Create a context and two command queues.
+//   3. Build the float / int / uint variant of every image kernel.
+//   4. Run the two-queue test when numCQ == 2, the one-queue test otherwise.
+//   5. Release every object created here.
+//
+// The device_, context_, queue_ and numElements_ arguments are not used by
+// this function; it creates its own context and queues.
+//
+// Returns CL_SUCCESS (0) on success, a non-zero error code otherwise.
+int test_image_common(cl_device_id device_, cl_context context_,
+                      cl_command_queue queue_, int numElements_)
+{
+    cl_int err = CL_SUCCESS;
+    cl_platform_id platform = NULL;
+    size_t extensionSize = 0;
+    cl_uint num_devices = 0;
+    cl_uint device_no = 0;
+    // Every handle below starts out as NULL so that an early jump to CLEANUP
+    // never frees or releases an indeterminate value.
+    cl_device_id *devices = NULL;
+    char *extensions = NULL;
+    cl_command_queue cmd_queue1 = NULL;
+    cl_command_queue cmd_queue2 = NULL;
+    cl_context context = NULL;
+    // One kernel per entry of num2DImagesList plus one for the Cross-CQ case
+    const uint32_t num_kernels = ARRAY_SIZE(num2DImagesList) + 1;
+    const uint32_t num_kernel_types = 3;
+    const char *kernel_source[num_kernels] = { kernel_text_numImage_1,
+                                               kernel_text_numImage_2,
+                                               kernel_text_numImage_4 };
+    char source_1[4096];
+    char source_2[4096];
+    char source_3[4096];
+    size_t program_source_length;
+    cl_program program[num_kernel_types] = { NULL, NULL, NULL };
+    cl_kernel kernel_float[num_kernels] = { NULL, NULL, NULL, NULL };
+    cl_kernel kernel_signed[num_kernels] = { NULL, NULL, NULL, NULL };
+    cl_kernel kernel_unsigned[num_kernels] = { NULL, NULL, NULL, NULL };
+    VulkanDevice vkDevice;
+    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    // get the platform ID
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS)
+    {
+        print_error(err, "Error: Failed to get platform\n");
+        goto CLEANUP;
+    }
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
+    if (CL_SUCCESS != err)
+    {
+        print_error(err, "clGetDeviceIDs failed in returning no. of devices\n");
+        goto CLEANUP;
+    }
+    devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+    if (NULL == devices)
+    {
+        err = CL_OUT_OF_HOST_MEMORY;
+        print_error(err, "Unable to allocate memory for devices\n");
+        goto CLEANUP;
+    }
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices,
+                         NULL);
+    if (CL_SUCCESS != err)
+    {
+        print_error(err, "Failed to get deviceID.\n");
+        goto CLEANUP;
+    }
+    contextProperties[1] = (cl_context_properties)platform;
+    log_info("Assigned contextproperties for platform\n");
+    // Look for the OpenCL device that is the same physical GPU as vkDevice.
+    for (device_no = 0; device_no < num_devices; device_no++)
+    {
+        err = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0, NULL,
+                              &extensionSize);
+        if (CL_SUCCESS != err)
+        {
+            print_error(
+                err,
+                "Error in clGetDeviceInfo for getting device_extension size\n");
+            goto CLEANUP;
+        }
+        // Drop the buffer of the previously inspected device; without this
+        // every iteration after the first leaked its allocation.
+        free(extensions);
+        extensions = (char *)malloc(extensionSize);
+        if (NULL == extensions)
+        {
+            err = CL_OUT_OF_HOST_MEMORY;
+            print_error(err, "Unable to allocate memory for extensions\n");
+            goto CLEANUP;
+        }
+        err = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS,
+                              extensionSize, extensions, NULL);
+        if (CL_SUCCESS != err)
+        {
+            print_error(
+                err, "Error in clGetDeviceInfo for getting device_extension\n");
+            goto CLEANUP;
+        }
+        err = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR,
+                              CL_UUID_SIZE_KHR, uuid, &extensionSize);
+        if (CL_SUCCESS != err)
+        {
+            print_error(err, "clGetDeviceInfo failed with error");
+            goto CLEANUP;
+        }
+        // memcmp() yields 0 on a match, which doubles as CL_SUCCESS in err.
+        err =
+            memcmp(uuid, vkDevice.getPhysicalDevice().getUUID(), VK_UUID_SIZE);
+        if (err == 0)
+        {
+            break;
+        }
+    }
+    if (device_no >= num_devices)
+    {
+        err = EXIT_FAILURE;
+        print_error(err,
+                    "OpenCL error:"
+                    "No Vulkan-OpenCL Interop capable GPU found.\n");
+        goto CLEANUP;
+    }
+    deviceId = devices[device_no];
+    err = setMaxImageDimensions(deviceId, max_width, max_height);
+    if (CL_SUCCESS != err)
+    {
+        print_error(err, "error setting max image dimensions");
+        goto CLEANUP;
+    }
+    log_info("Set max_width to %lu and max_height to %lu\n", max_width,
+             max_height);
+    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
+                                      NULL, NULL, &err);
+    if (CL_SUCCESS != err)
+    {
+        print_error(err, "error creating context");
+        goto CLEANUP;
+    }
+    log_info("Successfully created context !!!\n");
+    cmd_queue1 = clCreateCommandQueue(context, devices[device_no], 0, &err);
+    if (CL_SUCCESS != err)
+    {
+        print_error(err, "Error: Failed to create command queue!\n");
+        goto CLEANUP;
+    }
+    log_info("clCreateCommandQueue successfull \n");
+    cmd_queue2 = clCreateCommandQueue(context, devices[device_no], 0, &err);
+    if (CL_SUCCESS != err)
+    {
+        print_error(err, "Error: Failed to create command queue!\n");
+        goto CLEANUP;
+    }
+    log_info("clCreateCommandQueue2 successful \n");
+    for (uint32_t i = 0; i < num_kernels; i++)
+    {
+        // Instantiate the kernel template for float, int and uint pixels.
+        // snprintf() bounds the write to the fixed-size source buffers.
+        switch (i)
+        {
+            case 0:
+                snprintf(source_1, sizeof(source_1), kernel_source[i],
+                         "float4", "f", "float4", "f", "f", "f");
+                snprintf(source_2, sizeof(source_2), kernel_source[i], "int4",
+                         "i", "int4", "i", "i", "i");
+                snprintf(source_3, sizeof(source_3), kernel_source[i],
+                         "uint4", "ui", "uint4", "ui", "ui", "ui");
+                break;
+            case 1:
+            case 3:
+                // Case 3 is the additional kernel used as updateKernelCQ2; it
+                // takes two images and therefore reuses the two-image source.
+                snprintf(source_1, sizeof(source_1), kernel_source[1],
+                         "float4", "f", "float4", "f", "float4", "f",
+                         "float4", "f", "f", "f", "f", "f");
+                snprintf(source_2, sizeof(source_2), kernel_source[1], "int4",
+                         "i", "int4", "i", "int4", "i", "int4", "i", "i", "i",
+                         "i", "i");
+                snprintf(source_3, sizeof(source_3), kernel_source[1],
+                         "uint4", "ui", "uint4", "ui", "uint4", "ui", "uint4",
+                         "ui", "ui", "ui", "ui", "ui");
+                break;
+            case 2:
+                snprintf(source_1, sizeof(source_1), kernel_source[i],
+                         "float4", "f", "float4", "f", "float4", "f",
+                         "float4", "f", "float4", "f", "float4", "f",
+                         "float4", "f", "float4", "f",
+                         "f", "f", "f", "f", "f", "f", "f", "f");
+                snprintf(source_2, sizeof(source_2), kernel_source[i],
+                         "int4", "i", "int4", "i", "int4", "i",
+                         "int4", "i", "int4", "i", "int4", "i",
+                         "int4", "i", "int4", "i",
+                         "i", "i", "i", "i", "i", "i", "i", "i");
+                snprintf(source_3, sizeof(source_3), kernel_source[i],
+                         "uint4", "ui", "uint4", "ui", "uint4", "ui",
+                         "uint4", "ui", "uint4", "ui", "uint4", "ui",
+                         "uint4", "ui", "uint4", "ui",
+                         "ui", "ui", "ui", "ui", "ui", "ui", "ui", "ui");
+                break;
+        }
+        const char *sourceTexts[num_kernel_types] = { source_1, source_2,
+                                                      source_3 };
+        for (uint32_t k = 0; k < num_kernel_types; k++)
+        {
+            program_source_length = strlen(sourceTexts[k]);
+            program[k] = clCreateProgramWithSource(
+                context, 1, &sourceTexts[k], &program_source_length, &err);
+            // Check each step on its own: the next iteration overwrites err,
+            // so a deferred check would only ever see the last program.
+            if (err != CL_SUCCESS)
+            {
+                print_error(err, "Error: Failed to build program");
+                goto CLEANUP;
+            }
+            err = clBuildProgram(program[k], 0, NULL, NULL, NULL, NULL);
+            if (err != CL_SUCCESS)
+            {
+                print_error(err, "Error: Failed to build program");
+                goto CLEANUP;
+            }
+        }
+        // create the kernel
+        kernel_float[i] = clCreateKernel(program[0], "image2DKernel", &err);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clCreateKernel failed");
+            goto CLEANUP;
+        }
+        kernel_signed[i] = clCreateKernel(program[1], "image2DKernel", &err);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clCreateKernel failed");
+            goto CLEANUP;
+        }
+        kernel_unsigned[i] = clCreateKernel(program[2], "image2DKernel", &err);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clCreateKernel failed ");
+            goto CLEANUP;
+        }
+        // The program slots are reused by the next iteration, so release them
+        // now instead of leaking them. The kernels created above keep their
+        // program alive until they are released themselves.
+        for (uint32_t k = 0; k < num_kernel_types; k++)
+        {
+            clReleaseProgram(program[k]);
+            program[k] = NULL;
+        }
+    }
+    if (numCQ == 2)
+    {
+        err = run_test_with_two_queue(context, cmd_queue1, cmd_queue2,
+                                      kernel_unsigned, kernel_signed,
+                                      kernel_float, vkDevice);
+    }
+    else
+    {
+        err = run_test_with_one_queue(context, cmd_queue1, kernel_unsigned,
+                                      kernel_signed, kernel_float, vkDevice);
+    }
+CLEANUP:
+    for (uint32_t i = 0; i < num_kernels; i++)
+    {
+        if (kernel_float[i])
+        {
+            clReleaseKernel(kernel_float[i]);
+        }
+        if (kernel_unsigned[i])
+        {
+            clReleaseKernel(kernel_unsigned[i]);
+        }
+        if (kernel_signed[i])
+        {
+            clReleaseKernel(kernel_signed[i]);
+        }
+    }
+    // Only non-NULL after a failure in the middle of a build iteration.
+    for (uint32_t i = 0; i < num_kernel_types; i++)
+    {
+        if (program[i])
+        {
+            clReleaseProgram(program[i]);
+        }
+    }
+    if (cmd_queue1) clReleaseCommandQueue(cmd_queue1);
+    if (cmd_queue2) clReleaseCommandQueue(cmd_queue2);
+    if (context) clReleaseContext(context);
+    if (extensions) free(extensions);
+    if (devices) free(devices);
+    return err;
+}
diff --git a/test_conformance/vulkan/test_vulkan_platform_device_info.cpp b/test_conformance/vulkan/test_vulkan_platform_device_info.cpp
new file mode 100644
index 0000000..12f373b
--- /dev/null
+++ b/test_conformance/vulkan/test_vulkan_platform_device_info.cpp
@@ -0,0 +1,146 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include "harness/testHarness.h"
+#include <iostream>
+#include <string>
+typedef struct // pairs an OpenCL info-query enum with its printable name
+    cl_uint info; // enum value passed as param_name to clGetPlatformInfo / clGetDeviceInfo
+    const char *name; // stringified enum, only used when logging the result
+} _info;
+_info platform_info_table[] = { // queries issued for every platform by test_platform_info()
+#define STRING(x)                                                              \
+    {                                                                          \
+        x, #x                                                                  \
+    }
+#undef STRING // STRING(x) expands to the initializer { x, "x" }: the enum and its own name
+_info device_info_table[] = { // queries issued on the device under test by test_device_info()
+#define STRING(x)                                                              \
+    {                                                                          \
+        x, #x                                                                  \
+    }
+#undef STRING // STRING(x) expands to the initializer { x, "x" }: the enum and its own name
+// Logs, for every available OpenCL platform, the value returned by each query
+// listed in platform_info_table.
+//
+// Each result is treated as an array of cl_uint and printed in hex, last
+// element first. deviceID, _context, _queue and num_elements belong to the
+// common test signature and are not used here.
+//
+// Returns TEST_PASS when every query succeeds, otherwise the failing OpenCL
+// error code (CL_OUT_OF_HOST_MEMORY when a host allocation fails). Every
+// buffer allocated here is released before returning.
+int test_platform_info(cl_device_id deviceID, cl_context _context,
+                       cl_command_queue _queue, int num_elements)
+{
+    const size_t num_queries =
+        sizeof(platform_info_table) / sizeof(platform_info_table[0]);
+    cl_uint num_platforms = 0;
+    cl_platform_id *platforms = NULL;
+    cl_int errNum = CL_SUCCESS;
+    // get total # of platforms
+    errNum = clGetPlatformIDs(0, NULL, &num_platforms);
+    test_error(errNum, "clGetPlatformIDs (getting count) failed");
+    platforms =
+        (cl_platform_id *)malloc(num_platforms * sizeof(cl_platform_id));
+    if (!platforms)
+    {
+        log_error("error allocating memory\n");
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+    log_info("%u platforms available\n", num_platforms);
+    errNum = clGetPlatformIDs(num_platforms, platforms, NULL);
+    if (errNum != CL_SUCCESS)
+    {
+        print_error(errNum, "clGetPlatformIDs (getting IDs) failed");
+    }
+    // Both loops stop at the first failure so that the single cleanup below
+    // runs on every path.
+    for (cl_uint i = 0; errNum == CL_SUCCESS && i < num_platforms; i++)
+    {
+        // %p instead of casting the handle to unsigned long, which would
+        // truncate it where pointers are wider than long.
+        log_info("Platform%u (id %p) info:\n", i, (void *)platforms[i]);
+        for (size_t j = 0; errNum == CL_SUCCESS && j < num_queries; j++)
+        {
+            size_t handle_type_size = 0;
+            errNum =
+                clGetPlatformInfo(platforms[i], platform_info_table[j].info, 0,
+                                  NULL, &handle_type_size);
+            if (errNum != CL_SUCCESS)
+            {
+                print_error(errNum, "clGetPlatformInfo failed");
+                break;
+            }
+            cl_uint *handle_type = (cl_uint *)malloc(handle_type_size);
+            // malloc(0) may legitimately return NULL, so only a failed
+            // non-empty request is an error.
+            if (handle_type_size != 0 && handle_type == NULL)
+            {
+                log_error("error allocating memory\n");
+                errNum = CL_OUT_OF_HOST_MEMORY;
+                break;
+            }
+            errNum =
+                clGetPlatformInfo(platforms[i], platform_info_table[j].info,
+                                  handle_type_size, handle_type, NULL);
+            if (errNum != CL_SUCCESS)
+            {
+                print_error(errNum, "clGetPlatformInfo failed");
+            }
+            else
+            {
+                log_info("%s: \n", platform_info_table[j].name);
+                for (size_t k = handle_type_size / sizeof(cl_uint); k-- > 0;)
+                {
+                    log_info("%x \n", handle_type[k]);
+                }
+            }
+            free(handle_type);
+        }
+    }
+    free(platforms);
+    return (errNum == CL_SUCCESS) ? TEST_PASS : errNum;
+}
+// Logs the value of every query listed in device_info_table for deviceID.
+//
+// Each result is treated as an array of cl_uint and printed in hex, last
+// element first. _context, _queue and num_elements belong to the common test
+// signature and are not used here.
+//
+// Returns TEST_PASS when every query succeeds, otherwise the failing OpenCL
+// error code (CL_OUT_OF_HOST_MEMORY when the result buffer cannot be
+// allocated). The result buffer is released on every return path.
+int test_device_info(cl_device_id deviceID, cl_context _context,
+                     cl_command_queue _queue, int num_elements)
+{
+    const size_t num_queries =
+        sizeof(device_info_table) / sizeof(device_info_table[0]);
+    cl_int errNum = CL_SUCCESS;
+    for (size_t j = 0; j < num_queries; j++)
+    {
+        size_t handle_type_size = 0;
+        // Nothing is allocated yet, so the early-returning macro is safe here.
+        errNum = clGetDeviceInfo(deviceID, device_info_table[j].info, 0, NULL,
+                                 &handle_type_size);
+        test_error(errNum, "clGetDeviceInfo failed");
+        cl_uint *handle_type = (cl_uint *)malloc(handle_type_size);
+        // malloc(0) may legitimately return NULL, so only a failed non-empty
+        // request is an error.
+        if (handle_type_size != 0 && handle_type == NULL)
+        {
+            log_error("error allocating memory\n");
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+        errNum = clGetDeviceInfo(deviceID, device_info_table[j].info,
+                                 handle_type_size, handle_type, NULL);
+        if (errNum != CL_SUCCESS)
+        {
+            print_error(errNum, "clGetDeviceInfo failed");
+            free(handle_type);
+            return errNum;
+        }
+        log_info("%s: \n", device_info_table[j].name);
+        for (size_t k = handle_type_size / sizeof(cl_uint); k-- > 0;)
+        {
+            log_info("%x \n", handle_type[k]);
+        }
+        free(handle_type);
+    }
+    return TEST_PASS;
+}
diff --git a/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.cpp b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.cpp
new file mode 100644
index 0000000..9d9a660
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.cpp
@@ -0,0 +1,853 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <CL/cl_ext.h>
+#include "opencl_vulkan_wrapper.hpp"
+#include "vulkan_wrapper.hpp"
+#include "harness/errorHelpers.h"
+#include "harness/deviceInfo.h"
+#include <assert.h>
+#include <iostream>
+#include <stdexcept>
+#define ASSERT(x) assert((x)) // thin alias for assert(); has no effect when NDEBUG is defined
+#define GB(x) ((unsigned long long)(x) << 30) // x * 2^30, i.e. x GiB expressed in bytes
+pfnclCreateSemaphoreWithPropertiesKHR clCreateSemaphoreWithPropertiesKHRptr; // assigned by init_cl_vk_ext()
+pfnclEnqueueWaitSemaphoresKHR clEnqueueWaitSemaphoresKHRptr; // assigned by init_cl_vk_ext()
+pfnclEnqueueSignalSemaphoresKHR clEnqueueSignalSemaphoresKHRptr; // assigned by init_cl_vk_ext()
+    clEnqueueAcquireExternalMemObjectsKHRptr; // NOTE(review): not assigned in the visible part of this file - confirm where it is resolved
+    clEnqueueReleaseExternalMemObjectsKHRptr; // NOTE(review): not assigned in the visible part of this file - confirm where it is resolved
+pfnclReleaseSemaphoreKHR clReleaseSemaphoreKHRptr; // assigned by init_cl_vk_ext()
+// Resolves the semaphore extension entry points of opencl_platform and stores
+// them in the file-level *ptr globals, in the order wait, signal, release,
+// create.
+//
+// Throws std::runtime_error naming the first entry point that the platform
+// does not provide; that global is left NULL in this case.
+void init_cl_vk_ext(cl_platform_id opencl_platform)
+{
+    // Fetch the address of one extension function from the platform.
+    auto addressOf = [opencl_platform](const char *entryPoint) {
+        return clGetExtensionFunctionAddressForPlatform(opencl_platform,
+                                                        entryPoint);
+    };
+    // Abort initialisation as soon as one lookup came back empty.
+    auto require = [](bool resolved, const char *failure) {
+        if (!resolved)
+        {
+            throw std::runtime_error(failure);
+        }
+    };
+    clEnqueueWaitSemaphoresKHRptr =
+        (pfnclEnqueueWaitSemaphoresKHR)addressOf("clEnqueueWaitSemaphoresKHR");
+    require(clEnqueueWaitSemaphoresKHRptr != NULL,
+            "Failed to get the function pointer of "
+            "clEnqueueWaitSemaphoresKHRptr!");
+    clEnqueueSignalSemaphoresKHRptr = (pfnclEnqueueSignalSemaphoresKHR)
+        addressOf("clEnqueueSignalSemaphoresKHR");
+    require(clEnqueueSignalSemaphoresKHRptr != NULL,
+            "Failed to get the function pointer of "
+            "clEnqueueSignalSemaphoresKHRptr!");
+    clReleaseSemaphoreKHRptr =
+        (pfnclReleaseSemaphoreKHR)addressOf("clReleaseSemaphoreKHR");
+    require(clReleaseSemaphoreKHRptr != NULL,
+            "Failed to get the function pointer of "
+            "clReleaseSemaphoreKHRptr!");
+    clCreateSemaphoreWithPropertiesKHRptr =
+        (pfnclCreateSemaphoreWithPropertiesKHR)addressOf(
+            "clCreateSemaphoreWithPropertiesKHR");
+    require(clCreateSemaphoreWithPropertiesKHRptr != NULL,
+            "Failed to get the function pointer of "
+            "clCreateSemaphoreWithPropertiesKHRptr!");
+}
+// Chooses the largest image size the tests use, scaled by the device's
+// CL_DEVICE_GLOBAL_MEM_SIZE:
+//   below 4 GiB -> 256 x 256
+//   below 8 GiB -> 512 x 256
+//   otherwise   -> 1024 x 512
+// Returns the clGetDeviceInfo status; max_width and max_height are written
+// only when the query succeeds.
+cl_int setMaxImageDimensions(cl_device_id deviceID, size_t &max_width,
+                             size_t &max_height)
+{
+    cl_ulong globalMemBytes;
+    size_t reportedSize;
+    const cl_int status =
+        clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong),
+                        &globalMemBytes, &reportedSize);
+    if (status != CL_SUCCESS)
+    {
+        return status;
+    }
+    const bool below4GiB = globalMemBytes < GB(4);
+    const bool below8GiB = globalMemBytes < GB(8);
+    max_width = below4GiB ? 256 : (below8GiB ? 512 : 1024);
+    max_height = below8GiB ? 256 : 512;
+    return status;
+}
+// Translates a Vulkan image format into the matching OpenCL channel order and
+// channel data type.
+//
+// vkFormat       Vulkan format to translate.
+// clImageFormat  Receives the translation; written only on success.
+//
+// Returns CL_SUCCESS for a mapped format and CL_INVALID_VALUE otherwise. The
+// four A8B8G8R8_*_PACK32 formats are rejected silently; any other unmapped
+// format is logged and trips ASSERT. Unlike before, an unmapped format also
+// reports CL_INVALID_VALUE when assertions are compiled out, instead of
+// returning CL_SUCCESS with clImageFormat untouched.
+//
+// NOTE(review): GetElementNBytes() in this file yields 0 for the
+// CL_UNORM_SHORT_565 / CL_UNORM_SHORT_555 types unless the order has three
+// channels, so the four *_PACK16 rows below end up with a zero element size
+// downstream - confirm these pairings are intended.
+cl_int getCLFormatFromVkFormat(VkFormat vkFormat,
+                               cl_image_format *clImageFormat)
+{
+    struct FormatMapping
+    {
+        VkFormat vk;
+        cl_channel_order order;
+        cl_channel_type type;
+    };
+    static const FormatMapping kFormatTable[] = {
+        { VK_FORMAT_R8G8B8A8_UNORM, CL_RGBA, CL_UNORM_INT8 },
+        { VK_FORMAT_B8G8R8A8_UNORM, CL_BGRA, CL_UNORM_INT8 },
+        { VK_FORMAT_R16G16B16A16_UNORM, CL_RGBA, CL_UNORM_INT16 },
+        { VK_FORMAT_R8G8B8A8_SINT, CL_RGBA, CL_SIGNED_INT8 },
+        { VK_FORMAT_R16G16B16A16_SINT, CL_RGBA, CL_SIGNED_INT16 },
+        { VK_FORMAT_R32G32B32A32_SINT, CL_RGBA, CL_SIGNED_INT32 },
+        { VK_FORMAT_R8G8B8A8_UINT, CL_RGBA, CL_UNSIGNED_INT8 },
+        { VK_FORMAT_R16G16B16A16_UINT, CL_RGBA, CL_UNSIGNED_INT16 },
+        { VK_FORMAT_R32G32B32A32_UINT, CL_RGBA, CL_UNSIGNED_INT32 },
+        { VK_FORMAT_R16G16B16A16_SFLOAT, CL_RGBA, CL_HALF_FLOAT },
+        { VK_FORMAT_R32G32B32A32_SFLOAT, CL_RGBA, CL_FLOAT },
+        { VK_FORMAT_R8_SNORM, CL_R, CL_SNORM_INT8 },
+        { VK_FORMAT_R16_SNORM, CL_R, CL_SNORM_INT16 },
+        { VK_FORMAT_R8_UNORM, CL_R, CL_UNORM_INT8 },
+        { VK_FORMAT_R16_UNORM, CL_R, CL_UNORM_INT16 },
+        { VK_FORMAT_R8_SINT, CL_R, CL_SIGNED_INT8 },
+        { VK_FORMAT_R16_SINT, CL_R, CL_SIGNED_INT16 },
+        { VK_FORMAT_R32_SINT, CL_R, CL_SIGNED_INT32 },
+        { VK_FORMAT_R8_UINT, CL_R, CL_UNSIGNED_INT8 },
+        { VK_FORMAT_R16_UINT, CL_R, CL_UNSIGNED_INT16 },
+        { VK_FORMAT_R32_UINT, CL_R, CL_UNSIGNED_INT32 },
+        { VK_FORMAT_R16_SFLOAT, CL_R, CL_HALF_FLOAT },
+        { VK_FORMAT_R32_SFLOAT, CL_R, CL_FLOAT },
+        { VK_FORMAT_R8G8_SNORM, CL_RG, CL_SNORM_INT8 },
+        { VK_FORMAT_R16G16_SNORM, CL_RG, CL_SNORM_INT16 },
+        { VK_FORMAT_R8G8_UNORM, CL_RG, CL_UNORM_INT8 },
+        { VK_FORMAT_R16G16_UNORM, CL_RG, CL_UNORM_INT16 },
+        { VK_FORMAT_R8G8_SINT, CL_RG, CL_SIGNED_INT8 },
+        { VK_FORMAT_R16G16_SINT, CL_RG, CL_SIGNED_INT16 },
+        { VK_FORMAT_R32G32_SINT, CL_RG, CL_SIGNED_INT32 },
+        { VK_FORMAT_R8G8_UINT, CL_RG, CL_UNSIGNED_INT8 },
+        { VK_FORMAT_R16G16_UINT, CL_RG, CL_UNSIGNED_INT16 },
+        { VK_FORMAT_R32G32_UINT, CL_RG, CL_UNSIGNED_INT32 },
+        { VK_FORMAT_R16G16_SFLOAT, CL_RG, CL_HALF_FLOAT },
+        { VK_FORMAT_R32G32_SFLOAT, CL_RG, CL_FLOAT },
+        { VK_FORMAT_R5G6B5_UNORM_PACK16, CL_RGBA, CL_UNORM_SHORT_565 },
+        { VK_FORMAT_R5G5B5A1_UNORM_PACK16, CL_RGBA, CL_UNORM_SHORT_555 },
+        { VK_FORMAT_R8G8B8A8_SNORM, CL_RGBA, CL_SNORM_INT8 },
+        { VK_FORMAT_R16G16B16A16_SNORM, CL_RGBA, CL_SNORM_INT16 },
+        { VK_FORMAT_B8G8R8A8_SNORM, CL_BGRA, CL_SNORM_INT8 },
+        { VK_FORMAT_B5G6R5_UNORM_PACK16, CL_BGRA, CL_UNORM_SHORT_565 },
+        { VK_FORMAT_B5G5R5A1_UNORM_PACK16, CL_BGRA, CL_UNORM_SHORT_555 },
+        { VK_FORMAT_B8G8R8A8_SINT, CL_BGRA, CL_SIGNED_INT8 },
+        { VK_FORMAT_B8G8R8A8_UINT, CL_BGRA, CL_UNSIGNED_INT8 },
+    };
+    const size_t tableLength = sizeof(kFormatTable) / sizeof(kFormatTable[0]);
+    for (size_t i = 0; i < tableLength; i++)
+    {
+        if (kFormatTable[i].vk == vkFormat)
+        {
+            clImageFormat->image_channel_order = kFormatTable[i].order;
+            clImageFormat->image_channel_data_type = kFormatTable[i].type;
+            return CL_SUCCESS;
+        }
+    }
+    switch (vkFormat)
+    {
+        // Known formats without an OpenCL counterpart: rejected quietly.
+        case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+        case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+        case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+        case VK_FORMAT_A8B8G8R8_UINT_PACK32: break;
+        default:
+            log_error("Unsupported format\n");
+            ASSERT(0);
+            break;
+    }
+    return CL_INVALID_VALUE;
+}
+// Maps a Vulkan image dimensionality to the OpenCL memory-object type of the
+// same dimensionality. Any other VkImageType yields CL_INVALID_VALUE converted
+// to cl_mem_object_type, which callers compare against CL_INVALID_VALUE.
+cl_mem_object_type getImageTypeFromVk(VkImageType imageType)
+{
+    if (imageType == VK_IMAGE_TYPE_1D)
+    {
+        return CL_MEM_OBJECT_IMAGE1D;
+    }
+    if (imageType == VK_IMAGE_TYPE_2D)
+    {
+        return CL_MEM_OBJECT_IMAGE2D;
+    }
+    if (imageType == VK_IMAGE_TYPE_3D)
+    {
+        return CL_MEM_OBJECT_IMAGE3D;
+    }
+    return CL_INVALID_VALUE;
+}
+// Returns the size in bytes of one pixel of the given OpenCL image format, or
+// 0 when the channel order or data type is not handled, or when a packed
+// data type is combined with an order that does not have three channels.
+size_t GetElementNBytes(const cl_image_format *format)
+{
+    // Step 1: number of channels implied by the channel order.
+    size_t channelCount = 0;
+    switch (format->image_channel_order)
+    {
+        case CL_R:
+        case CL_A:
+        case CL_INTENSITY:
+        case CL_LUMINANCE:
+        case CL_DEPTH: channelCount = 1; break;
+        case CL_RG:
+        case CL_RA: channelCount = 2; break;
+        case CL_RGB: channelCount = 3; break;
+        case CL_RGBA:
+        case CL_ARGB:
+        case CL_BGRA:
+        case CL_sRGBA: channelCount = 4; break;
+        default: break;
+    }
+    // Packed data types store the whole pixel in one word and are only
+    // accepted together with a three-channel order.
+    const bool threeChannels = (channelCount == 3);
+    // Step 2: scale by the per-channel width, or use the packed word size.
+    switch (format->image_channel_data_type)
+    {
+        case CL_SNORM_INT8:
+        case CL_UNORM_INT8:
+        case CL_SIGNED_INT8:
+        case CL_UNSIGNED_INT8: return channelCount;
+        case CL_SNORM_INT16:
+        case CL_UNORM_INT16:
+        case CL_SIGNED_INT16:
+        case CL_UNSIGNED_INT16:
+        case CL_HALF_FLOAT: return channelCount * 2;
+        case CL_SIGNED_INT32:
+        case CL_UNSIGNED_INT32:
+        case CL_FLOAT: return channelCount * 4;
+        case CL_UNORM_SHORT_565:
+        case CL_UNORM_SHORT_555: return threeChannels ? 2 : 0;
+        case CL_UNORM_INT_101010: return threeChannels ? 4 : 0;
+        default: return 0;
+    }
+}
+// Derives the OpenCL image width and height for a Vulkan 2D image.
+//
+// The row pitch is the Vulkan extent width times the element size, rounded up
+// to a multiple of 64 bytes. width is that pitch expressed in elements, and
+// height is totalImageSize divided by the pitch.
+//
+// VulkanImageCreateInfo  Source of the image extent.
+// img_fmt                OpenCL format used to size one element.
+// totalImageSize         Size in bytes of the whole image allocation.
+// width, height          Outputs; written only when CL_SUCCESS is returned.
+//
+// Returns CL_INVALID_VALUE, without dividing, when a pointer is NULL, when
+// totalImageSize or the extent width is 0, or when GetElementNBytes() reports
+// a zero element size. Previously those inputs led to a division by zero.
+cl_int get2DImageDimensions(const VkImageCreateInfo *VulkanImageCreateInfo,
+                            cl_image_format *img_fmt, size_t totalImageSize,
+                            size_t &width, size_t &height)
+{
+    const size_t kRowPitchAlignment = 64;
+    if (VulkanImageCreateInfo == NULL || img_fmt == NULL
+        || totalImageSize == 0)
+    {
+        return CL_INVALID_VALUE;
+    }
+    const size_t element_size = GetElementNBytes(img_fmt);
+    if (element_size == 0 || VulkanImageCreateInfo->extent.width == 0)
+    {
+        return CL_INVALID_VALUE;
+    }
+    size_t row_pitch = element_size * VulkanImageCreateInfo->extent.width;
+    // Round up to the next multiple of the alignment.
+    row_pitch = ((row_pitch + kRowPitchAlignment - 1) / kRowPitchAlignment)
+        * kRowPitchAlignment;
+    width = row_pitch / element_size;
+    height = totalImageSize / row_pitch;
+    return CL_SUCCESS;
+}
+getCLImageInfoFromVkImageInfo(const VkImageCreateInfo *VulkanImageCreateInfo, // fills img_fmt and img_desc from a Vulkan image description
+                              size_t totalImageSize, cl_image_format *img_fmt,
+                              cl_image_desc *img_desc)
+    cl_int result = CL_SUCCESS;
+    cl_image_format clImgFormat = { 0 }; // scratch copy, so img_fmt is only written after a successful translation
+    result =
+        getCLFormatFromVkFormat(VulkanImageCreateInfo->format, &clImgFormat);
+    if (CL_SUCCESS != result)
+    {
+        return result; // unmapped Vulkan format: img_fmt and img_desc are left untouched
+    }
+    memcpy(img_fmt, &clImgFormat, sizeof(cl_image_format));
+    img_desc->image_type = getImageTypeFromVk(VulkanImageCreateInfo->imageType);
+    if (CL_INVALID_VALUE == img_desc->image_type) // getImageTypeFromVk() reports an unmapped image type this way
+    {
+        return CL_INVALID_VALUE;
+    }
+    result =
+        get2DImageDimensions(VulkanImageCreateInfo, img_fmt, totalImageSize, // the 2D helper is used whichever image type was mapped above
+                             img_desc->image_width, img_desc->image_height);
+    if (CL_SUCCESS != result)
+    {
+        throw std::runtime_error("get2DImageDimensions failed!!!"); // unlike the failures above, this one throws instead of returning a code
+    }
+    img_desc->image_depth = 0; // VulkanImageCreateInfo->extent.depth;
+    img_desc->image_array_size = 0;
+    img_desc->image_row_pitch = 0; // Row pitch set to zero as host_ptr is NULL
+    img_desc->image_slice_pitch =
+        img_desc->image_row_pitch * img_desc->image_height; // evaluates to 0 because the row pitch was just set to 0
+    img_desc->num_mip_levels = 1; // NOTE(review): the core OpenCL image descriptor expects 0 here - confirm 1 is intended
+    img_desc->num_samples = 0;
+    img_desc->buffer = NULL;
+    return result;
+// Checks whether deviceID reports requiredHandleType among the external
+// memory handle types it can import
+// (CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR).
+//
+// Returns CL_SUCCESS when the handle type is listed, CL_INVALID_VALUE when it
+// is not (an error is logged), or the clGetDeviceInfo error code when the
+// query itself fails.
+//
+// The list is walked by element count rather than by its size in bytes, which
+// previously read past the end of the buffer, and it is held in a std::vector
+// so that it is released on every return path.
+cl_int check_external_memory_handle_type(
+    cl_device_id deviceID,
+    cl_external_memory_handle_type_khr requiredHandleType)
+{
+    size_t handle_type_size = 0;
+    cl_int errNum = clGetDeviceInfo(
+        deviceID, CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, 0, NULL,
+        &handle_type_size);
+    test_error(
+        errNum,
+        "Unable to query CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR \n");
+    std::vector<cl_external_memory_handle_type_khr> handle_types(
+        handle_type_size / sizeof(cl_external_memory_handle_type_khr));
+    if (!handle_types.empty())
+    {
+        errNum = clGetDeviceInfo(
+            deviceID, CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR,
+            handle_types.size() * sizeof(cl_external_memory_handle_type_khr),
+            handle_types.data(), NULL);
+        test_error(
+            errNum,
+            "Unable to query CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR \n");
+    }
+    for (size_t i = 0; i < handle_types.size(); i++)
+    {
+        if (requiredHandleType == handle_types[i])
+        {
+            return CL_SUCCESS;
+        }
+    }
+    log_error("cl_khr_external_memory extension is missing support for %d\n",
+              requiredHandleType);
+    return CL_INVALID_VALUE;
+}
+// Checks whether deviceID reports requiredHandleType among the external
+// semaphore handle types it can import
+// (CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR).
+//
+// Returns CL_SUCCESS when the handle type is listed, CL_INVALID_VALUE when it
+// is not (an error is logged), or the clGetDeviceInfo error code when the
+// query itself fails.
+//
+// The list is walked by element count rather than by its size in bytes, which
+// previously read past the end of the buffer, and it is held in a std::vector
+// so that it is released on every return path.
+cl_int check_external_semaphore_handle_type(
+    cl_device_id deviceID,
+    cl_external_semaphore_handle_type_khr requiredHandleType)
+{
+    size_t handle_type_size = 0;
+    cl_int errNum =
+        clGetDeviceInfo(deviceID, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,
+                        0, NULL, &handle_type_size);
+    test_error(
+        errNum,
+        "Unable to query CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR \n");
+    std::vector<cl_external_semaphore_handle_type_khr> handle_types(
+        handle_type_size / sizeof(cl_external_semaphore_handle_type_khr));
+    if (!handle_types.empty())
+    {
+        errNum = clGetDeviceInfo(
+            deviceID, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,
+            handle_types.size()
+                * sizeof(cl_external_semaphore_handle_type_khr),
+            handle_types.data(), NULL);
+        test_error(
+            errNum,
+            "Unable to query CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR \n");
+    }
+    for (size_t i = 0; i < handle_types.size(); i++)
+    {
+        if (requiredHandleType == handle_types[i])
+        {
+            return CL_SUCCESS;
+        }
+    }
+    log_error("cl_khr_external_semaphore extension is missing support for %d\n",
+              requiredHandleType);
+    return CL_INVALID_VALUE;
+}
+// Creates an empty wrapper. The handle starts out as NULL so that the
+// destructor never hands an indeterminate value to clReleaseMemObject.
+clExternalMemory::clExternalMemory(): m_externalMemory(NULL) {}
+// Creates a second wrapper around the same cl_mem. The destructor releases
+// the object once per wrapper, so the copy takes its own reference here;
+// without it the object would be released twice.
+clExternalMemory::clExternalMemory(const clExternalMemory &externalMemory)
+    : m_externalMemory(externalMemory.m_externalMemory)
+{
+    if (m_externalMemory != NULL)
+    {
+        clRetainMemObject(m_externalMemory);
+    }
+}
+    const VulkanDeviceMemory *deviceMemory, // Vulkan allocation whose handle is fetched with getHandle() below
+    VulkanExternalMemoryHandleType externalMemoryHandleType, uint64_t offset, // NOTE(review): offset is not referenced in the visible body
+    uint64_t size, cl_context context, cl_device_id deviceId)
+    int err = 0;
+    m_externalMemory = NULL;
+    cl_device_id devList[] = { deviceId, NULL };
+    std::vector<cl_mem_properties> extMemProperties; // property list built up below; presumably the argument missing from the create call - confirm
+#ifdef _WIN32
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_win32"))
+    {
+        throw std::runtime_error(
+            "Device does not support cl_khr_external_memory_win32 extension\n");
+    }
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd")) // NOTE(review): no #else/#endif is visible here - presumably the non-Windows branch; confirm
+    {
+        throw std::runtime_error(
+            "Device does not support cl_khr_external_memory_opaque_fd "
+            "extension \n");
+    }
+    switch (externalMemoryHandleType) // NOTE(review): the case labels of this switch are not visible in this view
+    {
+#ifdef _WIN32
+            ASSERT(0);
+            log_info("Opaque file descriptors are not supported on Windows\n");
+            fd = (int)deviceMemory->getHandle(externalMemoryHandleType);
+            err = check_external_memory_handle_type( // the device must list the opaque-fd handle type as importable
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
+            extMemProperties.push_back( // property pair: handle-type key, then the handle value
+                (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
+            extMemProperties.push_back((cl_mem_properties)fd);
+            break;
+#ifndef _WIN32
+            ASSERT(0);
+            log_info(" Opaque NT handles are only supported on Windows\n");
+            handle = deviceMemory->getHandle(externalMemoryHandleType);
+            err = check_external_memory_handle_type( // the device must list the opaque win32 handle type as importable
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR);
+            extMemProperties.push_back( // property pair: handle-type key, then the handle value
+                (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR);
+            extMemProperties.push_back((cl_mem_properties)handle);
+            break;
+#ifndef _WIN32
+            ASSERT(0);
+            log_info("Opaque D3DKMT handles are only supported on Windows\n");
+            handle = deviceMemory->getHandle(externalMemoryHandleType);
+            err = check_external_memory_handle_type( // the device must list the win32 KMT handle type as importable
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            extMemProperties.push_back(
+                (cl_mem_properties)
+            extMemProperties.push_back((cl_mem_properties)handle);
+            break;
+        default:
+            ASSERT(0);
+            log_error("Unsupported external memory handle type\n"); // NOTE(review): err stays 0 here, so with asserts compiled out the throw below is skipped
+            break;
+    }
+    if (CL_SUCCESS != err) // err holds the result of check_external_memory_handle_type() from the case taken
+    {
+        throw std::runtime_error("Unsupported external memory type\n ");
+    }
+    extMemProperties.push_back((cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR); // device list: key, the single device, list terminator
+    extMemProperties.push_back((cl_mem_properties)devList[0]);
+    extMemProperties.push_back(
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR);
+    extMemProperties.push_back(0); // terminates the whole property list
+    m_externalMemory = clCreateBufferWithProperties(
+        context,, 1, size, NULL, &err); // NOTE(review): an argument is missing after context; the literal 1 sits in the flags position (presumably CL_MEM_READ_WRITE) - confirm
+    if (CL_SUCCESS != err)
+    {
+        log_error("clCreateBufferWithProperties failed with %d\n", err);
+        throw std::runtime_error("clCreateBufferWithProperties failed ");
+    }
+    const VulkanDeviceMemory &deviceMemory, // Vulkan allocation whose handle is fetched with getHandle() below
+    VulkanExternalMemoryHandleType externalMemoryHandleType, cl_context context,
+    size_t totalImageMemSize, size_t imageWidth, size_t imageHeight, // NOTE(review): none of these three is referenced in the visible body
+    size_t totalSize, const VulkanImage2D &image2D, cl_device_id deviceId) // NOTE(review): totalSize is not referenced in the visible body
+    cl_int errcode_ret = 0;
+    std::vector<cl_mem_properties> extMemProperties1; // property list built up below; presumably the argument missing from the create call - confirm
+    cl_device_id devList[] = { deviceId, NULL };
+#ifdef _WIN32
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_win32"))
+    {
+        throw std::runtime_error("Device does not support "
+                                 "cl_khr_external_memory_win32 extension \n");
+    }
+#elif !defined(__APPLE__)
+    if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd"))
+    {
+        throw std::runtime_error(
+            "Device does not support cl_khr_external_memory_opaque_fd "
+            "extension\n");
+    }
+    switch (externalMemoryHandleType) // NOTE(review): the case labels of this switch are not visible in this view
+    {
+#ifdef _WIN32
+            log_info("Opaque NT handles are only supported on Windows\n");
+            handle = deviceMemory.getHandle(externalMemoryHandleType);
+            errcode_ret = check_external_memory_handle_type( // the device must list the opaque win32 handle type as importable
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR);
+            extMemProperties1.push_back( // property pair: handle-type key, then the handle value
+                (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR);
+            extMemProperties1.push_back((cl_mem_properties)handle);
+            break;
+            log_info("Opaque D3DKMT handles are only supported on Windows\n");
+            handle = deviceMemory.getHandle(externalMemoryHandleType);
+            errcode_ret = check_external_memory_handle_type( // the device must list the win32 KMT handle type as importable
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            extMemProperties1.push_back(
+                (cl_mem_properties)
+            extMemProperties1.push_back((cl_mem_properties)handle);
+            break;
+#elif !defined(__APPLE__)
+            log_info(" Opaque file descriptors are not supported on Windows\n");
+            fd = (int)deviceMemory.getHandle(externalMemoryHandleType);
+            errcode_ret = check_external_memory_handle_type( // the device must list the opaque-fd handle type as importable
+                devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
+            extMemProperties1.push_back( // property pair: handle-type key, then the handle value
+                (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
+            extMemProperties1.push_back((cl_mem_properties)fd);
+            break;
+        default:
+            ASSERT(0);
+            log_error("Unsupported external memory handle type\n"); // NOTE(review): errcode_ret stays 0 here, so with asserts compiled out the throw below is skipped
+            break;
+    }
+    if (CL_SUCCESS != errcode_ret) // errcode_ret holds the result of check_external_memory_handle_type() from the case taken
+    {
+        throw std::runtime_error("Unsupported external memory type\n ");
+    }
+    // Set cl_image_desc
+    size_t clImageFormatSize; // NOTE(review): not referenced in the visible body
+    cl_image_desc image_desc;
+    memset(&image_desc, 0x0, sizeof(cl_image_desc));
+    cl_image_format img_format = { 0 };
+    const VkImageCreateInfo VulkanImageCreateInfo =
+        image2D.getVkImageCreateInfo();
+    errcode_ret = getCLImageInfoFromVkImageInfo( // format and descriptor are derived from image2D's create info and image2D.getSize()
+        &VulkanImageCreateInfo, image2D.getSize(), &img_format, &image_desc);
+    if (CL_SUCCESS != errcode_ret)
+    {
+        throw std::runtime_error("getCLImageInfoFromVkImageInfo failed!!!");
+    }
+    extMemProperties1.push_back((cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR); // device list: key, the single device, list terminator
+    extMemProperties1.push_back((cl_mem_properties)devList[0]);
+    extMemProperties1.push_back(
+        (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR);
+    extMemProperties1.push_back(0); // terminates the whole property list
+    m_externalMemory = clCreateImageWithProperties(
+        context,, CL_MEM_READ_WRITE, &img_format, // NOTE(review): an argument is missing after context in this view
+        &image_desc, NULL, &errcode_ret);
+    if (CL_SUCCESS != errcode_ret)
+    {
+        throw std::runtime_error("clCreateImageWithProperties failed!!!");
+    }
+cl_mem clExternalMemory::getExternalMemoryBuffer() { return m_externalMemory; } // returns the wrapped cl_mem as is, without retaining it
+cl_mem clExternalMemoryImage::getExternalMemoryImage()
+    return m_externalMemory; // returns the wrapped cl_mem as is, without retaining it
+    clReleaseMemObject(m_externalMemory); // NOTE(review): the enclosing function header is not visible - presumably the clExternalMemoryImage destructor; confirm
+clExternalMemory::~clExternalMemory() { clReleaseMemObject(m_externalMemory); } // drops this wrapper's reference; the return code is ignored
+clExternalMemoryImage::clExternalMemoryImage() {} // NOTE(review): does not set m_externalMemory - confirm the class gives it a default value
+// clExternalSemaphore implementation //
+    const clExternalSemaphore &externalSemaphore) // NOTE(review): the constructor header line is not visible in this view
+    : m_externalSemaphore(externalSemaphore.m_externalSemaphore) // copies the handle only; NOTE(review): no retain is visible although the destructor code releases it - confirm copies cannot double-release
+    const VulkanSemaphore &semaphore, cl_context context, // Vulkan semaphore whose handle is fetched with getHandle() below
+    VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
+    cl_device_id deviceId)
+    cl_int err = 0;
+    cl_device_id devList[] = { deviceId, NULL };
+#ifdef _WIN32
+    if (!is_extension_available(devList[0], "cl_khr_external_semaphore_win32"))
+    {
+        throw std::runtime_error("Device does not support "
+                                 "cl_khr_external_semaphore_win32 extension\n");
+    }
+#elif !defined(__APPLE__)
+    if (!is_extension_available(devList[0],
+                                "cl_khr_external_semaphore_opaque_fd"))
+    {
+        throw std::runtime_error(
+            "Device does not support cl_khr_external_semaphore_opaque_fd "
+            "extension \n");
+    }
+    std::vector<cl_semaphore_properties_khr> sema_props{ // starts with the semaphore type pair; handle and device entries are appended below
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR,
+    };
+    switch (externalSemaphoreHandleType) // NOTE(review): the case labels of this switch are not visible in this view
+    {
+#ifdef _WIN32
+            ASSERT(0);
+            log_info(" Opaque file descriptors are not supported on Windows\n");
+            fd = (int)semaphore.getHandle(externalSemaphoreHandleType);
+            err = check_external_semaphore_handle_type( // the device must list the opaque-fd handle type as importable
+                devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
+            sema_props.push_back( // property pair: handle-type key, then the handle value
+                (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
+            sema_props.push_back((cl_semaphore_properties_khr)fd);
+            break;
+#ifndef _WIN32
+            ASSERT(0);
+            log_info(" Opaque NT handles are only supported on Windows\n");
+            handle = semaphore.getName().size() // a semaphore with a non-empty name passes NULL instead of a handle
+                ? NULL
+                : semaphore.getHandle(externalSemaphoreHandleType);
+            err = check_external_semaphore_handle_type( // the device must list the opaque win32 handle type as importable
+                devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR);
+            sema_props.push_back((cl_semaphore_properties_khr)
+                                     CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR);
+            sema_props.push_back((cl_semaphore_properties_khr)handle);
+            break;
+#ifndef _WIN32
+            ASSERT(0);
+            log_info(" Opaque D3DKMT handles are only supported on Windows\n");
+            handle = semaphore.getHandle(externalSemaphoreHandleType);
+            err = check_external_semaphore_handle_type( // the device must list the win32 KMT handle type as importable
+                devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            sema_props.push_back((cl_semaphore_properties_khr)
+                                     CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR);
+            sema_props.push_back((cl_semaphore_properties_khr)handle);
+            break;
+        default:
+            ASSERT(0);
+            log_error("Unsupported external memory handle type\n"); // NOTE(review): err stays 0 here, so with asserts compiled out the throw below is skipped
+            break;
+    }
+    if (CL_SUCCESS != err) // err holds the result of check_external_semaphore_handle_type() from the case taken
+    {
+        throw std::runtime_error(
+            "Unsupported external sempahore handle type\n ");
+    }
+    sema_props.push_back( // device list: key, the single device, list terminator
+        (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_KHR);
+    sema_props.push_back((cl_semaphore_properties_khr)devList[0]);
+    sema_props.push_back(
+        (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_END_KHR);
+    sema_props.push_back(0); // terminates the whole property list
+    m_externalSemaphore =
+        clCreateSemaphoreWithPropertiesKHRptr(context,, &err); // NOTE(review): an argument is missing after context in this view; the pointer is the one set by init_cl_vk_ext()
+    if (CL_SUCCESS != err)
+    {
+        log_error("clCreateSemaphoreWithPropertiesKHRptr failed with %d\n",
+                  err);
+        throw std::runtime_error(
+            "clCreateSemaphoreWithPropertiesKHRptr failed! ");
+    }
+    cl_int err = clReleaseSemaphoreKHRptr(m_externalSemaphore); // releases the wrapped semaphore; NOTE(review): the enclosing header is not visible - presumably the destructor
+    if (err != CL_SUCCESS)
+    {
+        throw std::runtime_error("clReleaseSemaphoreKHR failed!"); // NOTE(review): if this is a destructor, throwing here ends in std::terminate under the default noexcept - confirm
+    }
+// Enqueues a signal of this semaphore on cmd_queue, with no payload, no wait
+// list and no completion event. A failed enqueue is logged rather than
+// silently dropped; the signature stays void, so callers are unaffected.
+void clExternalSemaphore::signal(cl_command_queue cmd_queue)
+{
+    const cl_int err = clEnqueueSignalSemaphoresKHRptr(
+        cmd_queue, 1, &m_externalSemaphore, NULL, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueSignalSemaphoresKHR failed with %d\n", err);
+    }
+}
+// Enqueues a wait on this semaphore on cmd_queue, with no payload, no wait
+// list and no completion event. A failed enqueue is logged rather than
+// silently dropped; the signature stays void, so callers are unaffected.
+void clExternalSemaphore::wait(cl_command_queue cmd_queue)
+{
+    const cl_int err = clEnqueueWaitSemaphoresKHRptr(
+        cmd_queue, 1, &m_externalSemaphore, NULL, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueWaitSemaphoresKHR failed with %d\n", err);
+    }
+}
diff --git a/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.hpp b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.hpp
new file mode 100644
index 0000000..d9f8dcc
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.hpp
@@ -0,0 +1,131 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _opencl_vulkan_wrapper_hpp_
+#define _opencl_vulkan_wrapper_hpp_
+#include "vulkan_wrapper.hpp"
+#if !defined(__APPLE__)
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_ext.h>
+typedef cl_semaphore_khr (*pfnclCreateSemaphoreWithPropertiesKHR)(
+    cl_context context, cl_semaphore_properties_khr *sema_props,
+    cl_int *errcode_ret);
+typedef cl_int (*pfnclEnqueueWaitSemaphoresKHR)(
+    cl_command_queue command_queue, cl_uint num_semaphores,
+    const cl_semaphore_khr *sema_list,
+    const cl_semaphore_payload_khr *sema_payload_list,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event);
+typedef cl_int (*pfnclEnqueueSignalSemaphoresKHR)(
+    cl_command_queue command_queue, cl_uint num_semaphores,
+    const cl_semaphore_khr *sema_list,
+    const cl_semaphore_payload_khr *sema_payload_list,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event);
+typedef cl_int (*pfnclEnqueueAcquireExternalMemObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_mem_objects,
+    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+typedef cl_int (*pfnclEnqueueReleaseExternalMemObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_mem_objects,
+    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+typedef cl_int (*pfnclReleaseSemaphoreKHR)(cl_semaphore_khr sema_object);
+extern pfnclCreateSemaphoreWithPropertiesKHR
+    clCreateSemaphoreWithPropertiesKHRptr;
+extern pfnclEnqueueWaitSemaphoresKHR clEnqueueWaitSemaphoresKHRptr;
+extern pfnclEnqueueSignalSemaphoresKHR clEnqueueSignalSemaphoresKHRptr;
+extern pfnclEnqueueAcquireExternalMemObjectsKHR
+    clEnqueueAcquireExternalMemObjectsKHRptr;
+extern pfnclEnqueueReleaseExternalMemObjectsKHR
+    clEnqueueReleaseExternalMemObjectsKHRptr;
+extern pfnclReleaseSemaphoreKHR clReleaseSemaphoreKHRptr;
+cl_int getCLImageInfoFromVkImageInfo(const VkImageCreateInfo *, size_t,
+                                     cl_image_format *, cl_image_desc *);
+cl_int check_external_memory_handle_type(
+    cl_device_id deviceID,
+    cl_external_memory_handle_type_khr requiredHandleType);
+cl_int check_external_semaphore_handle_type(
+    cl_device_id deviceID,
+    cl_external_semaphore_handle_type_khr requiredHandleType);
+cl_int setMaxImageDimensions(cl_device_id deviceID, size_t &width,
+                             size_t &height);
+class clExternalMemory { // Holds a cl_mem plus an fd and an opaque handle; NOTE(review): presumably the OpenCL-side import of a Vulkan allocation - confirm in the .cpp
+    cl_mem m_externalMemory;
+    int fd; // NOTE(review): presumably the file-descriptor form of the external handle - confirm
+    void *handle; // NOTE(review): presumably the Win32 handle form of the external handle - confirm
+    clExternalMemory(const clExternalMemory &externalMemory);
+    clExternalMemory();
+    clExternalMemory(const VulkanDeviceMemory *deviceMemory, // takes the Vulkan device memory, the external handle type, an offset/size pair, and the target context and device
+                     VulkanExternalMemoryHandleType externalMemoryHandleType,
+                     uint64_t offset, uint64_t size, cl_context context,
+                     cl_device_id deviceId);
+    virtual ~clExternalMemory();
+    cl_mem getExternalMemoryBuffer(); // returns a cl_mem; NOTE(review): presumably m_externalMemory - confirm
+class clExternalMemoryImage { // Image counterpart of clExternalMemory: holds a cl_mem, an fd, an opaque handle and a command queue
+    cl_mem m_externalMemory;
+    int fd; // NOTE(review): presumably the file-descriptor form of the external handle - confirm
+    void *handle; // NOTE(review): presumably the Win32 handle form of the external handle - confirm
+    cl_command_queue cmd_queue;
+    clExternalMemoryImage();
+    clExternalMemoryImage( // takes the Vulkan device memory and 2D image, the external handle type, image dimensions/sizes, and the target context and device
+        const VulkanDeviceMemory &deviceMemory,
+        VulkanExternalMemoryHandleType externalMemoryHandleType,
+        cl_context context, size_t totalImageMemSize, size_t imageWidth,
+        size_t imageHeight, size_t totalSize, const VulkanImage2D &image2D,
+        cl_device_id deviceId);
+    virtual ~clExternalMemoryImage();
+    cl_mem getExternalMemoryImage(); // returns a cl_mem; NOTE(review): presumably m_externalMemory - confirm
+class clExternalSemaphore { // Holds a cl_semaphore_khr plus an fd and an opaque handle, and exposes signal()/wait() on a command queue
+    cl_semaphore_khr m_externalSemaphore;
+    int fd; // NOTE(review): presumably the file-descriptor form of the external handle - confirm
+    void *handle; // NOTE(review): presumably the Win32 handle form of the external handle - confirm
+    clExternalSemaphore(const clExternalSemaphore &externalSemaphore);
+    clExternalSemaphore( // takes the Vulkan semaphore, the target context, the external handle type and the device
+        const VulkanSemaphore &deviceSemaphore, cl_context context,
+        VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
+        cl_device_id deviceId);
+    virtual ~clExternalSemaphore();
+    void signal(cl_command_queue command_queue); // returns void, so no status is reported to the caller
+    void wait(cl_command_queue command_queue); // returns void, so no status is reported to the caller
+    // operator openclExternalSemaphore_t() const;
+extern void init_cl_vk_ext(cl_platform_id);
+#endif // _opencl_vulkan_wrapper_hpp_
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_api_list.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_api_list.hpp
new file mode 100644
index 0000000..017aefd
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_api_list.hpp
@@ -0,0 +1,195 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _vulkan_api_list_hpp_
+#define _vulkan_api_list_hpp_
+#define VK_FUNC_LIST                                                           \
+    VK_FUNC_DECL(vkEnumerateInstanceVersion)                                   \
+    VK_FUNC_DECL(vkEnumerateInstanceExtensionProperties)                       \
+    VK_FUNC_DECL(vkEnumerateInstanceLayerProperties)                           \
+    VK_FUNC_DECL(vkCreateInstance)                                             \
+    VK_FUNC_DECL(vkGetInstanceProcAddr)                                        \
+    VK_FUNC_DECL(vkGetDeviceProcAddr)                                          \
+    VK_FUNC_DECL(vkEnumeratePhysicalDevices)                                   \
+    VK_FUNC_DECL(vkGetPhysicalDeviceProperties)                                \
+    VK_FUNC_DECL(vkCreateDevice)                                               \
+    VK_FUNC_DECL(vkDestroyDevice)                                              \
+    VK_FUNC_DECL(vkGetDeviceQueue)                                             \
+    VK_FUNC_DECL(vkQueueWaitIdle)                                              \
+    VK_FUNC_DECL(vkCreateDescriptorSetLayout)                                  \
+    VK_FUNC_DECL(vkCreatePipelineLayout)                                       \
+    VK_FUNC_DECL(vkCreateShaderModule)                                         \
+    VK_FUNC_DECL(vkCreateComputePipelines)                                     \
+    VK_FUNC_DECL(vkCreateDescriptorPool)                                       \
+    VK_FUNC_DECL(vkAllocateDescriptorSets)                                     \
+    VK_FUNC_DECL(vkFreeDescriptorSets)                                         \
+    VK_FUNC_DECL(vkAllocateCommandBuffers)                                     \
+    VK_FUNC_DECL(vkBeginCommandBuffer)                                         \
+    VK_FUNC_DECL(vkCmdBindPipeline)                                            \
+    VK_FUNC_DECL(vkCmdBindDescriptorSets)                                      \
+    VK_FUNC_DECL(vkCmdPipelineBarrier)                                         \
+    VK_FUNC_DECL(vkCmdDispatch)                                                \
+    VK_FUNC_DECL(vkCmdFillBuffer)                                              \
+    VK_FUNC_DECL(vkCmdCopyBuffer)                                              \
+    VK_FUNC_DECL(vkCmdUpdateBuffer)                                            \
+    VK_FUNC_DECL(vkCmdCopyBufferToImage)                                       \
+    VK_FUNC_DECL(vkCmdCopyImageToBuffer)                                       \
+    VK_FUNC_DECL(vkEndCommandBuffer)                                           \
+    VK_FUNC_DECL(vkCreateBuffer)                                               \
+    VK_FUNC_DECL(vkCreateImageView)                                            \
+    VK_FUNC_DECL(vkAllocateMemory)                                             \
+    VK_FUNC_DECL(vkMapMemory)                                                  \
+    VK_FUNC_DECL(vkBindBufferMemory)                                           \
+    VK_FUNC_DECL(vkBindImageMemory)                                            \
+    VK_FUNC_DECL(vkUnmapMemory)                                                \
+    VK_FUNC_DECL(vkFreeMemory)                                                 \
+    VK_FUNC_DECL(vkCreateCommandPool)                                          \
+    VK_FUNC_DECL(vkResetCommandPool)                                           \
+    VK_FUNC_DECL(vkDestroyCommandPool)                                         \
+    VK_FUNC_DECL(vkResetCommandBuffer)                                         \
+    VK_FUNC_DECL(vkFreeCommandBuffers)                                         \
+    VK_FUNC_DECL(vkQueueSubmit)                                                \
+    VK_FUNC_DECL(vkCmdExecuteCommands)                                         \
+    VK_FUNC_DECL(vkCreateFence)                                                \
+    VK_FUNC_DECL(vkDestroyFence)                                               \
+    VK_FUNC_DECL(vkGetFenceStatus)                                             \
+    VK_FUNC_DECL(vkResetFences)                                                \
+    VK_FUNC_DECL(vkWaitForFences)                                              \
+    VK_FUNC_DECL(vkCreateSemaphore)                                            \
+    VK_FUNC_DECL(vkDestroySemaphore)                                           \
+    VK_FUNC_DECL(vkCreateEvent)                                                \
+    VK_FUNC_DECL(vkDestroyImageView)                                           \
+    VK_FUNC_DECL(vkCreateImage)                                                \
+    VK_FUNC_DECL(vkGetImageMemoryRequirements)                                 \
+    VK_FUNC_DECL(vkDestroyImage)                                               \
+    VK_FUNC_DECL(vkDestroyBuffer)                                              \
+    VK_FUNC_DECL(vkDestroyPipeline)                                            \
+    VK_FUNC_DECL(vkDestroyShaderModule)                                        \
+    VK_FUNC_DECL(vkGetPhysicalDeviceMemoryProperties)                          \
+    VK_FUNC_DECL(vkDestroyInstance)                                            \
+    VK_FUNC_DECL(vkUpdateDescriptorSets)                                       \
+    VK_FUNC_DECL(vkDestroyDescriptorPool)                                      \
+    VK_FUNC_DECL(vkDestroyPipelineLayout)                                      \
+    VK_FUNC_DECL(vkDestroyDescriptorSetLayout)                                 \
+    VK_FUNC_DECL(vkGetPhysicalDeviceQueueFamilyProperties)                     \
+    VK_FUNC_DECL(vkGetPhysicalDeviceFeatures)                                  \
+    VK_FUNC_DECL(vkGetPhysicalDeviceProperties2KHR)                            \
+    VK_FUNC_DECL(vkGetBufferMemoryRequirements)                                \
+    VK_FUNC_DECL(vkGetMemoryFdKHR)                                             \
+    VK_FUNC_DECL(vkGetSemaphoreFdKHR)                                          \
+    VK_FUNC_DECL(vkEnumeratePhysicalDeviceGroups)                              \
+    VK_FUNC_DECL(vkGetPhysicalDeviceSurfaceCapabilitiesKHR)                    \
+    VK_FUNC_DECL(vkGetPhysicalDeviceSurfaceFormatsKHR)                         \
+    VK_FUNC_DECL(vkGetPhysicalDeviceSurfacePresentModesKHR)                    \
+    VK_FUNC_DECL(vkEnumerateDeviceExtensionProperties)                         \
+    VK_FUNC_DECL(vkGetPhysicalDeviceSurfaceSupportKHR)
+#define VK_WINDOWS_FUNC_LIST                                                   \
+    VK_FUNC_DECL(vkGetMemoryWin32HandleKHR)                                    \
+    VK_FUNC_DECL(vkGetSemaphoreWin32HandleKHR)
+#define vkEnumerateInstanceVersion _vkEnumerateInstanceVersion
+#define vkEnumerateInstanceExtensionProperties                                 \
+    _vkEnumerateInstanceExtensionProperties
+#define vkEnumerateInstanceLayerProperties _vkEnumerateInstanceLayerProperties
+#define vkCreateInstance _vkCreateInstance
+#define vkGetInstanceProcAddr _vkGetInstanceProcAddr
+#define vkGetDeviceProcAddr _vkGetDeviceProcAddr
+#define vkEnumeratePhysicalDevices _vkEnumeratePhysicalDevices
+#define vkGetPhysicalDeviceProperties _vkGetPhysicalDeviceProperties
+#define vkCreateDevice _vkCreateDevice
+#define vkDestroyDevice _vkDestroyDevice
+#define vkGetDeviceQueue _vkGetDeviceQueue
+#define vkQueueWaitIdle _vkQueueWaitIdle
+#define vkCreateDescriptorSetLayout _vkCreateDescriptorSetLayout
+#define vkCreatePipelineLayout _vkCreatePipelineLayout
+#define vkCreateShaderModule _vkCreateShaderModule
+#define vkCreateComputePipelines _vkCreateComputePipelines
+#define vkCreateDescriptorPool _vkCreateDescriptorPool
+#define vkAllocateDescriptorSets _vkAllocateDescriptorSets
+#define vkFreeDescriptorSets _vkFreeDescriptorSets
+#define vkAllocateCommandBuffers _vkAllocateCommandBuffers
+#define vkBeginCommandBuffer _vkBeginCommandBuffer
+#define vkCmdBindPipeline _vkCmdBindPipeline
+#define vkCmdBindDescriptorSets _vkCmdBindDescriptorSets
+#define vkCmdPipelineBarrier _vkCmdPipelineBarrier
+#define vkCmdDispatch _vkCmdDispatch
+#define vkCmdFillBuffer _vkCmdFillBuffer
+#define vkCmdCopyBuffer _vkCmdCopyBuffer
+#define vkCmdUpdateBuffer _vkCmdUpdateBuffer
+#define vkCmdCopyBufferToImage _vkCmdCopyBufferToImage
+#define vkCmdCopyImageToBuffer _vkCmdCopyImageToBuffer
+#define vkEndCommandBuffer _vkEndCommandBuffer
+#define vkCreateBuffer _vkCreateBuffer
+#define vkCreateImageView _vkCreateImageView
+#define vkAllocateMemory _vkAllocateMemory
+#define vkMapMemory _vkMapMemory
+#define vkBindBufferMemory _vkBindBufferMemory
+#define vkBindImageMemory _vkBindImageMemory
+#define vkUnmapMemory _vkUnmapMemory
+#define vkFreeMemory _vkFreeMemory
+#define vkCreateCommandPool _vkCreateCommandPool
+#define vkResetCommandPool _vkResetCommandPool
+#define vkDestroyCommandPool _vkDestroyCommandPool
+#define vkResetCommandBuffer _vkResetCommandBuffer
+#define vkFreeCommandBuffers _vkFreeCommandBuffers
+#define vkQueueSubmit _vkQueueSubmit
+#define vkCmdExecuteCommands _vkCmdExecuteCommands
+#define vkCreateFence _vkCreateFence
+#define vkDestroyFence _vkDestroyFence
+#define vkGetFenceStatus _vkGetFenceStatus
+#define vkResetFences _vkResetFences
+#define vkWaitForFences _vkWaitForFences
+#define vkCreateSemaphore _vkCreateSemaphore
+#define vkDestroySemaphore _vkDestroySemaphore
+#define vkCreateEvent _vkCreateEvent
+#define vkDestroyImageView _vkDestroyImageView
+#define vkCreateImage _vkCreateImage
+#define vkGetImageMemoryRequirements _vkGetImageMemoryRequirements
+#define vkDestroyImage _vkDestroyImage
+#define vkDestroyBuffer _vkDestroyBuffer
+#define vkDestroyPipeline _vkDestroyPipeline
+#define vkDestroyShaderModule _vkDestroyShaderModule
+#define vkGetPhysicalDeviceMemoryProperties _vkGetPhysicalDeviceMemoryProperties
+#define vkDestroyInstance _vkDestroyInstance
+#define vkUpdateDescriptorSets _vkUpdateDescriptorSets
+#define vkDestroyDescriptorPool _vkDestroyDescriptorPool
+#define vkDestroyPipelineLayout _vkDestroyPipelineLayout
+#define vkDestroyDescriptorSetLayout _vkDestroyDescriptorSetLayout
+#define vkGetPhysicalDeviceQueueFamilyProperties                               \
+    _vkGetPhysicalDeviceQueueFamilyProperties
+#define vkGetPhysicalDeviceFeatures _vkGetPhysicalDeviceFeatures
+#define vkGetPhysicalDeviceProperties2KHR _vkGetPhysicalDeviceProperties2KHR
+#define vkGetBufferMemoryRequirements _vkGetBufferMemoryRequirements
+#define vkGetMemoryFdKHR _vkGetMemoryFdKHR
+#define vkGetSemaphoreFdKHR _vkGetSemaphoreFdKHR
+#define vkEnumeratePhysicalDeviceGroups _vkEnumeratePhysicalDeviceGroups
+#define vkGetPhysicalDeviceSurfaceCapabilitiesKHR                              \
+    _vkGetPhysicalDeviceSurfaceCapabilitiesKHR
+#define vkGetPhysicalDeviceSurfaceFormatsKHR                                   \
+    _vkGetPhysicalDeviceSurfaceFormatsKHR
+#define vkGetPhysicalDeviceSurfacePresentModesKHR                              \
+    _vkGetPhysicalDeviceSurfacePresentModesKHR
+#define vkEnumerateDeviceExtensionProperties                                   \
+    _vkEnumerateDeviceExtensionProperties
+#define vkGetPhysicalDeviceSurfaceSupportKHR                                   \
+    _vkGetPhysicalDeviceSurfaceSupportKHR
+#define vkGetMemoryWin32HandleKHR _vkGetMemoryWin32HandleKHR
+#define vkGetSemaphoreWin32HandleKHR _vkGetSemaphoreWin32HandleKHR
+#endif //_vulkan_api_list_hpp_
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.cpp
new file mode 100644
index 0000000..db9d168
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.cpp
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "vulkan_interop_common.hpp"
+// Default iteration and thread counts. The types are spelled exactly as in
+// the extern declarations in vulkan_interop_common.hpp so the definition and
+// declaration agree on every platform, including those where uint32_t is not
+// a typedef of unsigned int.
+unsigned int innerIterations(5);
+unsigned int perfIterations(100);
+unsigned int stressIterations(1000);
+size_t cpuThreadsPerGpu(3);
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.hpp
new file mode 100644
index 0000000..18d84f0
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.hpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _vulkan_interop_common_hpp_
+#define _vulkan_interop_common_hpp_
+#include "vulkan_wrapper_types.hpp"
+#include "vulkan_wrapper.hpp"
+#include "vulkan_list_map.hpp"
+#include "vulkan_utility.hpp"
+#include "opencl_vulkan_wrapper.hpp"
+// Number of iterations for loops within tests (default value 5)
+extern unsigned int innerIterations;
+// Number of iterations for loops within perf tests (default value 100)
+extern unsigned int perfIterations;
+// Number of iterations for loops within stress tests (default value 1000)
+extern unsigned int stressIterations;
+// Number of CPU threads per GPU (default value 3)
+extern size_t cpuThreadsPerGpu;
+// Number of command queues (default value 1)
+extern unsigned int numCQ;
+// Enable Multi-import of vulkan device memory
+extern bool multiImport;
+// Enable Multi-import of vulkan device memory under different context
+extern bool multiCtx;
+// Enable additional debug info logging
+extern bool debug_trace;
+extern bool useSingleImageKernel;
+extern bool useDeviceLocal;
+extern bool disableNTHandleType;
+// Enable offset for multiImport of vulkan device memory
+extern bool enableOffset;
+extern bool non_dedicated;
+#endif // _vulkan_interop_common_hpp_
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.cpp
new file mode 100644
index 0000000..bdae5d2
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.cpp
@@ -0,0 +1,424 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifdef _WIN32
+#define NOMINMAX
+#include "vulkan_list_map.hpp"
+#include "vulkan_utility.hpp"
+#include "vulkan_wrapper.hpp"
+// VulkanPhysicalDeviceList implementation //
+    const VulkanPhysicalDeviceList &physicalDeviceList)
+VulkanPhysicalDeviceList::VulkanPhysicalDeviceList() {}
+VulkanPhysicalDeviceList::~VulkanPhysicalDeviceList() {}
+// VulkanMemoryHeapList implementation //
+    const VulkanMemoryHeapList &memoryHeapList)
+VulkanMemoryHeapList::VulkanMemoryHeapList() {}
+VulkanMemoryHeapList::~VulkanMemoryHeapList() {}
+// VulkanMemoryTypeList implementation //
+    const VulkanMemoryTypeList &memoryTypeList)
+VulkanMemoryTypeList::VulkanMemoryTypeList() {}
+VulkanMemoryTypeList::~VulkanMemoryTypeList() {}
+// VulkanQueueFamilyList implementation //
+    const VulkanQueueFamilyList &queueFamilyList)
+VulkanQueueFamilyList::VulkanQueueFamilyList() {}
+VulkanQueueFamilyList::~VulkanQueueFamilyList() {}
+// VulkanQueueFamilyToQueueCountMap implementation //
+    const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap)
+    uint32_t numQueuesPerFamily)
+    uint32_t maxQueueFamilyCount = 0;
+    const VulkanPhysicalDeviceList &physicalDeviceList =
+        getVulkanInstance().getPhysicalDeviceList();
+    for (size_t pdIdx = 0; pdIdx < physicalDeviceList.size(); pdIdx++)
+    {
+        maxQueueFamilyCount = std::max(
+            maxQueueFamilyCount,
+            (uint32_t)physicalDeviceList[pdIdx].getQueueFamilyList().size());
+    }
+    for (uint32_t qfIdx = 0; qfIdx < maxQueueFamilyCount; qfIdx++)
+    {
+        insert(qfIdx, numQueuesPerFamily);
+    }
+VulkanQueueFamilyToQueueCountMap::~VulkanQueueFamilyToQueueCountMap() {}
+// VulkanQueueFamilyToQueueListMap implementation //
+    const VulkanQueueFamilyToQueueListMap &queueFamilyToQueueMap)
+VulkanQueueFamilyToQueueListMap::VulkanQueueFamilyToQueueListMap() {}
+VulkanQueueFamilyToQueueListMap::~VulkanQueueFamilyToQueueListMap() {}
+void VulkanQueueFamilyToQueueListMap::insert(uint32_t key, // Registers queueList under key in m_map.
+                                             VulkanQueueList &queueList)
+    m_map.insert(std::pair<uint32_t, std::reference_wrapper<VulkanQueueList>>( // stores a reference, not a copy, so queueList must outlive this map
+        key, std::reference_wrapper<VulkanQueueList>(queueList)));
+// Returns the queue list that insert() registered under `key`. The map holds
+// std::reference_wrapper<VulkanQueueList>, so .get() yields the referenced
+// list. at() is used instead of operator[] because reference_wrapper is not
+// default-constructible; it throws std::out_of_range for an unknown key
+// (assuming m_map is a std::map, as in VulkanMap - confirm).
+VulkanQueueList &VulkanQueueFamilyToQueueListMap::operator[](uint32_t key)
+    return m_map.at(key).get();
+// VulkanQueueList implementation //
+VulkanQueueList::VulkanQueueList(const VulkanQueueList &queueList) {}
+VulkanQueueList::VulkanQueueList() {}
+VulkanQueueList::~VulkanQueueList() {}
+// VulkanDescriptorSetLayoutBindingList implementation //
+    const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList)
+VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList() {}
+    size_t numDescriptorSetLayoutBindings, VulkanDescriptorType descriptorType,
+    uint32_t descriptorCount, VulkanShaderStage shaderStage)
+    for (size_t idx = 0; idx < numDescriptorSetLayoutBindings; idx++)
+    {
+        VulkanDescriptorSetLayoutBinding *descriptorSetLayoutBinding =
+            new VulkanDescriptorSetLayoutBinding((uint32_t)idx, descriptorType,
+                                                 descriptorCount, shaderStage);
+        add(*descriptorSetLayoutBinding);
+    }
+    VulkanDescriptorType descriptorType0, uint32_t descriptorCount0,
+    VulkanDescriptorType descriptorType1, uint32_t descriptorCount1,
+    VulkanShaderStage shaderStage)
+    for (uint32_t idx = 0; idx < descriptorCount0; idx++)
+    {
+        VulkanDescriptorSetLayoutBinding *descriptorSetLayoutBinding0 =
+            new VulkanDescriptorSetLayoutBinding(idx, descriptorType0, 1,
+                                                 shaderStage);
+        add(*descriptorSetLayoutBinding0);
+    }
+    for (uint32_t idx = 0; idx < descriptorCount1; idx++)
+    {
+        VulkanDescriptorSetLayoutBinding *descriptorSetLayoutBinding1 =
+            new VulkanDescriptorSetLayoutBinding(
+                descriptorCount0 + idx, descriptorType1, 1, shaderStage);
+        add(*descriptorSetLayoutBinding1);
+    }
+    for (size_t idx = 0; idx < m_wrapperList.size(); idx++)
+    {
+        VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding =
+            m_wrapperList[idx];
+        delete &descriptorSetLayoutBinding;
+    }
+// VulkanDescriptorSetLayoutList implementation //
+    const VulkanDescriptorSetLayoutList &descriptorSetLayoutList)
+VulkanDescriptorSetLayoutList::VulkanDescriptorSetLayoutList() {}
+VulkanDescriptorSetLayoutList::~VulkanDescriptorSetLayoutList() {}
+// VulkanCommandBufferList implementation //
+    const VulkanCommandBufferList &commandBufferList)
+VulkanCommandBufferList::VulkanCommandBufferList() {}
+    size_t numCommandBuffers, const VulkanDevice &device,
+    const VulkanCommandPool &commandPool)
+    for (size_t idx = 0; idx < numCommandBuffers; idx++)
+    {
+        VulkanCommandBuffer *commandBuffer =
+            new VulkanCommandBuffer(device, commandPool);
+        add(*commandBuffer);
+    }
+    for (size_t idx = 0; idx < m_wrapperList.size(); idx++)
+    {
+        VulkanCommandBuffer &commandBuffer = m_wrapperList[idx];
+        delete &commandBuffer;
+    }
+// VulkanBufferList implementation //
+VulkanBufferList::VulkanBufferList(const VulkanBufferList &bufferList) {}
+    size_t numBuffers, const VulkanDevice &device, uint64_t size,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanBufferUsage bufferUsage, VulkanSharingMode sharingMode,
+    const VulkanQueueFamilyList &queueFamilyList)
+    for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
+    {
+        VulkanBuffer *buffer =
+            new VulkanBuffer(device, size, externalMemoryHandleType,
+                             bufferUsage, sharingMode, queueFamilyList);
+        add(*buffer);
+    }
+    for (size_t bIdx = 0; bIdx < m_wrapperList.size(); bIdx++)
+    {
+        VulkanBuffer &buffer = m_wrapperList[bIdx];
+        delete &buffer;
+    }
+// VulkanImage2DList implementation //
+VulkanImage2DList::VulkanImage2DList(const VulkanImage2DList &image2DList) {}
+    size_t numImages, std::vector<VulkanDeviceMemory *> &deviceMemory,
+    uint64_t baseOffset, uint64_t interImageOffset, const VulkanDevice &device,
+    VulkanFormat format, uint32_t width, uint32_t height, uint32_t mipLevels,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
+    VulkanSharingMode sharingMode)
+    for (size_t i2DIdx = 0; i2DIdx < numImages; i2DIdx++)
+    {
+        VulkanImage2D *image2D = new VulkanImage2D(
+            device, format, width, height, mipLevels, externalMemoryHandleType,
+            imageCreateFlag, imageUsage, sharingMode);
+        add(*image2D);
+        deviceMemory[i2DIdx]->bindImage(
+            *image2D, baseOffset + (i2DIdx * interImageOffset));
+    }
+    size_t numImages, const VulkanDevice &device, VulkanFormat format,
+    uint32_t width, uint32_t height, uint32_t mipLevels,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
+    VulkanSharingMode sharingMode)
+    for (size_t bIdx = 0; bIdx < numImages; bIdx++)
+    {
+        VulkanImage2D *image2D = new VulkanImage2D(
+            device, format, width, height, mipLevels, externalMemoryHandleType,
+            imageCreateFlag, imageUsage, sharingMode);
+        add(*image2D);
+    }
+    for (size_t i2DIdx = 0; i2DIdx < m_wrapperList.size(); i2DIdx++)
+    {
+        VulkanImage2D &image2D = m_wrapperList[i2DIdx];
+        delete &image2D;
+    }
+// VulkanImageViewList implementation //
+VulkanImageViewList::VulkanImageViewList(const VulkanImageViewList &image2DList)
+VulkanImageViewList::VulkanImageViewList(const VulkanDevice &device,
+                                         const VulkanImage2DList &image2DList,
+                                         bool createImageViewPerMipLevel)
+    for (size_t i2DIdx = 0; i2DIdx < image2DList.size(); i2DIdx++)
+    {
+        if (createImageViewPerMipLevel)
+        {
+            for (uint32_t mipLevel = 0;
+                 mipLevel < image2DList[i2DIdx].getNumMipLevels(); mipLevel++)
+            {
+                VulkanImageView *image2DView =
+                    new VulkanImageView(device, image2DList[i2DIdx],
+                                        VULKAN_IMAGE_VIEW_TYPE_2D, mipLevel, 1);
+                add(*image2DView);
+            }
+        }
+        else
+        {
+            VulkanImageView *image2DView = new VulkanImageView(
+                device, image2DList[i2DIdx], VULKAN_IMAGE_VIEW_TYPE_2D);
+            add(*image2DView);
+        }
+    }
+    for (size_t ivIdx = 0; ivIdx < m_wrapperList.size(); ivIdx++)
+    {
+        VulkanImageView &imageView = m_wrapperList[ivIdx];
+        delete &imageView;
+    }
+// VulkanDeviceMemoryList implementation //
+    const VulkanDeviceMemoryList &deviceMemoryList)
+    size_t numImages, const VulkanImage2DList &image2DList,
+    const VulkanDevice &device, const VulkanMemoryType &memoryType,
+    VulkanExternalMemoryHandleType externalMemoryHandleType)
+    for (size_t i2DIdx = 0; i2DIdx < image2DList.size(); i2DIdx++)
+    {
+        VulkanDeviceMemory *deviceMemory = new VulkanDeviceMemory(
+            device, image2DList[i2DIdx], memoryType, externalMemoryHandleType);
+        add(*deviceMemory);
+        deviceMemory->bindImage(image2DList[i2DIdx]);
+    }
+    for (size_t dmIdx = 0; dmIdx < m_wrapperList.size(); dmIdx++)
+    {
+        VulkanDeviceMemory &deviceMemory = m_wrapperList[dmIdx];
+        delete &deviceMemory;
+    }
+// VulkanSemaphoreList implementation //
+    const VulkanSemaphoreList &semaphoreList)
+VulkanSemaphoreList::VulkanSemaphoreList() {}
+    size_t numSemaphores, const VulkanDevice &device,
+    VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
+    const std::wstring namePrefix)
+    // Creates numSemaphores heap-allocated VulkanSemaphore objects and adds
+    // each to this list. When namePrefix is non-empty, semaphore idx is
+    // named "<namePrefix><idx>"; otherwise the name stays empty.
+    std::wstring name = L"";
+    for (size_t idx = 0; idx < numSemaphores; idx++)
+    {
+        if (namePrefix.size())
+        {
+            const size_t maxNameSize = 256;
+            wchar_t tempName[maxNameSize];
+            // Use %ls, not %s: in a wide format string standard C treats %s
+            // as a narrow char*, so passing a wchar_t* is undefined outside
+            // MSVC. %ls means a wide string on both MSVC and POSIX.
+            swprintf(tempName, maxNameSize, L"%ls%d", namePrefix.c_str(),
+                     (int)idx);
+            name = tempName;
+        }
+        VulkanSemaphore *semaphore =
+            new VulkanSemaphore(device, externalSemaphoreHandleType, name);
+        add(*semaphore);
+    }
+    for (size_t idx = 0; idx < m_wrapperList.size(); idx++)
+    {
+        VulkanSemaphore &Semaphore = m_wrapperList[idx];
+        delete &Semaphore;
+    }
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.hpp
new file mode 100644
index 0000000..5220677
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.hpp
@@ -0,0 +1,386 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _vulkan_list_map_hpp_
+#define _vulkan_list_map_hpp_
+#include <functional>
+#include "vulkan_wrapper_types.hpp"
+#include "vulkan_utility.hpp"
+#include <iostream>
+// Generic list of Vulkan wrapper objects together with a parallel array of
+// VulkanNative values. Wrappers are held by reference
+// (std::reference_wrapper), never by value. A list holds either mutable
+// wrappers or const wrappers: each add() overload rejects the call when the
+// other kind is already present.
+template <class VulkanWrapper, class VulkanNative> class VulkanList {
+    // References to wrappers added through add(VulkanWrapper &).
+    std::vector<std::reference_wrapper<VulkanWrapper>> m_wrapperList;
+    // References to wrappers added through add(const VulkanWrapper &).
+    std::vector<std::reference_wrapper<const VulkanWrapper>> m_constWrapperList;
+    // One VulkanNative value per accepted add() call, in insertion order.
+    std::vector<VulkanNative> m_nativeList;
+    // Copies the three vectors; the copy refers to the same wrapper objects.
+    VulkanList(const VulkanList &list);
+    VulkanList();
+    virtual ~VulkanList();
+    // Appends a mutable wrapper and its native value.
+    virtual void add(VulkanWrapper &wrapper);
+    // Appends a const wrapper and its native value.
+    virtual void add(const VulkanWrapper &wrapper);
+    // Number of wrappers in whichever of the two lists is in use.
+    virtual size_t size() const;
+    // Read-only access to the wrapper at idx (either list).
+    virtual const VulkanWrapper &operator[](size_t idx) const;
+    // Mutable access to the wrapper at idx (mutable list only).
+    virtual VulkanWrapper &operator[](size_t idx);
+    // Raw pointer access to VulkanNative values.
+    virtual const VulkanNative *operator()() const;
+// Thin wrapper around std::map<VulkanKey, VulkanValue> exposing insertion and
+// keyed lookup.
+template <class VulkanKey, class VulkanValue> class VulkanMap {
+    // Underlying key/value storage.
+    std::map<VulkanKey, VulkanValue> m_map;
+    // Copies the underlying map.
+    VulkanMap(const VulkanMap &map);
+    VulkanMap();
+    virtual ~VulkanMap();
+    // Adds a key/value entry to the underlying map.
+    void insert(const VulkanKey &key, VulkanValue &value);
+    // Read-only lookup by key.
+    const VulkanValue &operator[](const VulkanKey &key) const;
+    // Mutable lookup by key.
+    VulkanValue &operator[](const VulkanKey &key);
+// List of VulkanPhysicalDevice wrappers with VkPhysicalDevice as the native
+// type. VulkanInstance is declared a friend of this class.
+class VulkanPhysicalDeviceList
+    : public VulkanList<VulkanPhysicalDevice, VkPhysicalDevice> {
+    friend class VulkanInstance;
+    VulkanPhysicalDeviceList(
+        const VulkanPhysicalDeviceList &physicalDeviceList);
+    VulkanPhysicalDeviceList();
+    virtual ~VulkanPhysicalDeviceList();
+// List of VulkanQueueFamily wrappers with uint32_t as the native type.
+// VulkanPhysicalDevice is declared a friend of this class.
+class VulkanQueueFamilyList : public VulkanList<VulkanQueueFamily, uint32_t> {
+    friend class VulkanPhysicalDevice;
+    VulkanQueueFamilyList(const VulkanQueueFamilyList &queueFamilyList);
+    VulkanQueueFamilyList();
+    virtual ~VulkanQueueFamilyList();
+// List of VulkanMemoryHeap wrappers with uint32_t as the native type.
+// VulkanPhysicalDevice is declared a friend of this class.
+class VulkanMemoryHeapList : public VulkanList<VulkanMemoryHeap, uint32_t> {
+    friend class VulkanPhysicalDevice;
+    VulkanMemoryHeapList(const VulkanMemoryHeapList &memoryHeapList);
+    VulkanMemoryHeapList();
+    virtual ~VulkanMemoryHeapList();
+// List of VulkanMemoryType wrappers with uint32_t as the native type.
+// VulkanPhysicalDevice, VulkanBuffer and VulkanImage are declared friends.
+class VulkanMemoryTypeList : public VulkanList<VulkanMemoryType, uint32_t> {
+    friend class VulkanPhysicalDevice;
+    friend class VulkanBuffer;
+    friend class VulkanImage;
+    VulkanMemoryTypeList(const VulkanMemoryTypeList &memoryTypeList);
+    VulkanMemoryTypeList();
+    virtual ~VulkanMemoryTypeList();
+// Map from a uint32_t key to a uint32_t value; going by the class name, the
+// key is a queue family and the value a queue count.
+class VulkanQueueFamilyToQueueCountMap : public VulkanMap<uint32_t, uint32_t> {
+    VulkanQueueFamilyToQueueCountMap(
+        const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap);
+    // numQueuesPerFamily defaults to 0.
+    // NOTE(review): how the count is applied is defined in the .cpp, which is
+    // not shown here.
+    VulkanQueueFamilyToQueueCountMap(uint32_t numQueuesPerFamily = 0);
+    virtual ~VulkanQueueFamilyToQueueCountMap();
+// List of VulkanQueue wrappers with VkQueue as the native type.
+// VulkanDevice is declared a friend of this class.
+class VulkanQueueList : public VulkanList<VulkanQueue, VkQueue> {
+    friend class VulkanDevice;
+    VulkanQueueList(const VulkanQueueList &queueList);
+    VulkanQueueList();
+    virtual ~VulkanQueueList();
+// Map from a uint32_t key to a reference to a VulkanQueueList. Values are
+// stored as std::reference_wrapper, so the map does not hold the queue lists
+// by value.
+class VulkanQueueFamilyToQueueListMap
+    : public VulkanMap<uint32_t, std::reference_wrapper<VulkanQueueList>> {
+    VulkanQueueFamilyToQueueListMap(
+        const VulkanQueueFamilyToQueueListMap &queueFamilyToQueueMap);
+    VulkanQueueFamilyToQueueListMap();
+    virtual ~VulkanQueueFamilyToQueueListMap();
+    // Overloads taking and returning VulkanQueueList directly, rather than
+    // the std::reference_wrapper value type of the base map.
+    void insert(uint32_t key, VulkanQueueList &queueList);
+    VulkanQueueList &operator[](uint32_t key);
+// List of VulkanDescriptorSetLayoutBinding wrappers with
+// VkDescriptorSetLayoutBinding as the native type.
+class VulkanDescriptorSetLayoutBindingList
+    : public VulkanList<VulkanDescriptorSetLayoutBinding,
+                        VkDescriptorSetLayoutBinding> {
+    VulkanDescriptorSetLayoutBindingList(
+        const VulkanDescriptorSetLayoutBindingList
+            &descriptorSetLayoutBindingList);
+    VulkanDescriptorSetLayoutBindingList();
+    // Takes a binding count, one descriptor type, a descriptor count
+    // (default 1) and a shader stage (default compute).
+    VulkanDescriptorSetLayoutBindingList(
+        size_t numDescriptorSetLayoutBindings,
+        VulkanDescriptorType descriptorType, uint32_t descriptorCount = 1,
+        VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE);
+    // Takes two (descriptor type, descriptor count) pairs and a shader stage
+    // (default compute).
+    VulkanDescriptorSetLayoutBindingList(
+        VulkanDescriptorType descriptorType0, uint32_t descriptorCount0,
+        VulkanDescriptorType descriptorType1, uint32_t descriptorCount1,
+        VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE);
+    virtual ~VulkanDescriptorSetLayoutBindingList();
+// List of VulkanDescriptorSetLayout wrappers with VkDescriptorSetLayout as
+// the native type.
+class VulkanDescriptorSetLayoutList
+    : public VulkanList<VulkanDescriptorSetLayout, VkDescriptorSetLayout> {
+    VulkanDescriptorSetLayoutList(
+        const VulkanDescriptorSetLayoutList &descriptorSetLayoutList);
+    VulkanDescriptorSetLayoutList();
+    virtual ~VulkanDescriptorSetLayoutList();
+// List of VulkanCommandBuffer wrappers with VkCommandBuffer as the native
+// type.
+class VulkanCommandBufferList
+    : public VulkanList<VulkanCommandBuffer, VkCommandBuffer> {
+    VulkanCommandBufferList(const VulkanCommandBufferList &commandBufferList);
+    VulkanCommandBufferList();
+    // Takes the number of command buffers plus the device and command pool.
+    VulkanCommandBufferList(size_t numCommandBuffers,
+                            const VulkanDevice &device,
+                            const VulkanCommandPool &commandPool);
+    virtual ~VulkanCommandBufferList();
+// List of VulkanBuffer wrappers with VkBuffer as the native type.
+class VulkanBufferList : public VulkanList<VulkanBuffer, VkBuffer> {
+    VulkanBufferList(const VulkanBufferList &bufferList);
+    // Takes a buffer count, the device and a size, followed by defaulted
+    // handle-type, usage, sharing-mode and queue-family-list parameters.
+    // NOTE(review): the default values after '=' for externalMemoryHandleType
+    // and bufferUsage are missing from this text - restore them from the
+    // upstream header before building.
+    VulkanBufferList(
+        size_t numBuffers, const VulkanDevice &device, uint64_t size,
+        VulkanExternalMemoryHandleType externalMemoryHandleType =
+        VulkanBufferUsage bufferUsage =
+        VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE,
+        const VulkanQueueFamilyList &queueFamilyList =
+            getEmptyVulkanQueueFamilyList());
+    virtual ~VulkanBufferList();
+// List of VulkanImage2D wrappers with VkImage as the native type.
+class VulkanImage2DList : public VulkanList<VulkanImage2D, VkImage> {
+    VulkanImage2DList(const VulkanImage2DList &image2DList);
+    // Overload that additionally receives a vector of VulkanDeviceMemory
+    // pointers together with a base offset and an inter-image offset.
+    // NOTE(review): the default values after '=' for externalMemoryHandleType
+    // and imageUsage are missing from this text - restore them from the
+    // upstream header before building.
+    VulkanImage2DList(
+        size_t numImages, std::vector<VulkanDeviceMemory *> &deviceMemory,
+        uint64_t baseOffset, uint64_t interImageOffset,
+        const VulkanDevice &device, VulkanFormat format, uint32_t width,
+        uint32_t height, uint32_t mipLevels,
+        VulkanExternalMemoryHandleType externalMemoryHandleType =
+        VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE,
+        VulkanImageUsage imageUsage =
+        VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE);
+    // Overload without device-memory arguments; mipLevels defaults to 1.
+    // NOTE(review): the same two default values are missing here as well.
+    VulkanImage2DList(
+        size_t numImages, const VulkanDevice &device, VulkanFormat format,
+        uint32_t width, uint32_t height, uint32_t mipLevels = 1,
+        VulkanExternalMemoryHandleType externalMemoryHandleType =
+        VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE,
+        VulkanImageUsage imageUsage =
+        VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE);
+    virtual ~VulkanImage2DList();
+// List of VulkanImageView wrappers with VkImageView as the native type.
+class VulkanImageViewList : public VulkanList<VulkanImageView, VkImageView> {
+    VulkanImageViewList(const VulkanImageViewList &imageViewList);
+    // Takes the device and an image list; createImageViewPerMipLevel
+    // defaults to true.
+    VulkanImageViewList(const VulkanDevice &device,
+                        const VulkanImage2DList &image2DList,
+                        bool createImageViewPerMipLevel = true);
+    virtual ~VulkanImageViewList();
+// List of VulkanDeviceMemory wrappers with VkDeviceMemory as the native type.
+class VulkanDeviceMemoryList
+    : public VulkanList<VulkanDeviceMemory, VkDeviceMemory> {
+    VulkanDeviceMemoryList(const VulkanDeviceMemoryList &deviceMemoryList);
+    // Takes an image count, an image list, the device, a memory type and an
+    // external memory handle type.
+    // NOTE(review): the default value after '=' for externalMemoryHandleType
+    // and the closing ");" are missing from this text - restore them from the
+    // upstream header before building.
+    VulkanDeviceMemoryList(
+        size_t numImages, const VulkanImage2DList &image2DList,
+        const VulkanDevice &device, const VulkanMemoryType &memoryType,
+        VulkanExternalMemoryHandleType externalMemoryHandleType =
+    virtual ~VulkanDeviceMemoryList();
+// List of VulkanSemaphore wrappers with VkSemaphore as the native type.
+class VulkanSemaphoreList : public VulkanList<VulkanSemaphore, VkSemaphore> {
+    VulkanSemaphoreList(const VulkanSemaphoreList &semaphoreList);
+    VulkanSemaphoreList();
+    // Takes a semaphore count, the device, an external semaphore handle type
+    // and a wide-string name prefix that defaults to the empty string.
+    // NOTE(review): the default value after '=' for
+    // externalSemaphoreHandleType is missing from this text - restore it from
+    // the upstream header before building.
+    VulkanSemaphoreList(
+        size_t numSemaphores, const VulkanDevice &device,
+        VulkanExternalSemaphoreHandleType externalSemaphoreHandleType =
+        const std::wstring namePrefix = L"");
+    virtual ~VulkanSemaphoreList();
+// VulkanList implementation //
+// Copy constructor: copies the three member vectors. The wrapper vectors hold
+// std::reference_wrapper entries, so the new list refers to the same wrapper
+// objects as `list` rather than to copies of them.
+template <class VulkanWrapper, class VulkanNative>
+VulkanList<VulkanWrapper, VulkanNative>::VulkanList(const VulkanList &list)
+    : m_wrapperList(list.m_wrapperList),
+      m_constWrapperList(list.m_constWrapperList),
+      m_nativeList(list.m_nativeList)
+// Default constructor; takes no arguments and has no member initializer list.
+template <class VulkanWrapper, class VulkanNative>
+VulkanList<VulkanWrapper, VulkanNative>::VulkanList()
+// Destructor (declared virtual in the class definition).
+template <class VulkanWrapper, class VulkanNative>
+VulkanList<VulkanWrapper, VulkanNative>::~VulkanList()
+// Appends a mutable wrapper and its native value.
+// The call is rejected with a console message, leaving the list unchanged,
+// when the list already holds const wrappers.
+template <class VulkanWrapper, class VulkanNative>
+void VulkanList<VulkanWrapper, VulkanNative>::add(VulkanWrapper &wrapper)
+{
+    if (m_constWrapperList.empty())
+    {
+        m_wrapperList.push_back(std::ref(wrapper));
+        // The wrapper is converted to its native value for the parallel list.
+        m_nativeList.push_back((VulkanNative)wrapper);
+        return;
+    }
+    std::cout << "This list can only contain externally allocated objects"
+              << std::endl;
+}
+// Appends a const wrapper and its native value.
+// The call is rejected with a console message, leaving the list unchanged,
+// when the list already holds mutable wrappers.
+template <class VulkanWrapper, class VulkanNative>
+void VulkanList<VulkanWrapper, VulkanNative>::add(const VulkanWrapper &wrapper)
+{
+    if (m_wrapperList.empty())
+    {
+        m_constWrapperList.push_back(std::cref(wrapper));
+        // The wrapper is converted to its native value for the parallel list.
+        m_nativeList.push_back((VulkanNative)wrapper);
+        return;
+    }
+    std::cout << "This list cannot contain externally allocated objects"
+              << std::endl;
+}
+// Number of wrappers held: the mutable list's size when it is non-empty,
+// otherwise the const list's size.
+template <class VulkanWrapper, class VulkanNative>
+size_t VulkanList<VulkanWrapper, VulkanNative>::size() const
+{
+    if (!m_wrapperList.empty())
+    {
+        return m_wrapperList.size();
+    }
+    return m_constWrapperList.size();
+}
+// Read-only access to the wrapper at idx, taken from the mutable list when it
+// is non-empty and from the const list otherwise.
+// Throws std::out_of_range (from std::vector::at) when idx >= size().
+// Previously an out-of-range index fell off the end of the function without
+// returning a value, which is undefined behaviour.
+template <class VulkanWrapper, class VulkanNative>
+const VulkanWrapper &
+    VulkanList<VulkanWrapper, VulkanNative>::operator[](size_t idx) const
+{
+    if (!m_wrapperList.empty())
+    {
+        return m_wrapperList.at(idx).get();
+    }
+    return m_constWrapperList.at(idx).get();
+}
+// Mutable access to the wrapper at idx. Only the mutable list is consulted.
+// Throws std::out_of_range (from std::vector::at) when idx is not a valid
+// index into the mutable list, including the case where the list holds only
+// const wrappers. The index was previously unchecked.
+template <class VulkanWrapper, class VulkanNative>
+VulkanWrapper &VulkanList<VulkanWrapper, VulkanNative>::operator[](size_t idx)
+{
+    return m_wrapperList.at(idx).get();
+}
+// Returns a pointer to the contiguous array of native values recorded by
+// add(), or a null pointer when nothing has been added, so callers can test
+// the result against NULL to detect an empty list.
+// The previous body was a bare `return;`, which is ill-formed in a function
+// returning a pointer.
+template <class VulkanWrapper, class VulkanNative>
+const VulkanNative *VulkanList<VulkanWrapper, VulkanNative>::operator()() const
+{
+    return m_nativeList.empty() ? nullptr : m_nativeList.data();
+}
+// VulkanMap implementation //
+// Copy constructor: copies the underlying std::map from `map`.
+template <class VulkanKey, class VulkanValue>
+VulkanMap<VulkanKey, VulkanValue>::VulkanMap(const VulkanMap &map)
+    : m_map(map.m_map)
+// Default constructor; takes no arguments and has no member initializer list.
+template <class VulkanKey, class VulkanValue>
+VulkanMap<VulkanKey, VulkanValue>::VulkanMap()
+// Destructor (declared virtual in the class definition).
+template <class VulkanKey, class VulkanValue>
+VulkanMap<VulkanKey, VulkanValue>::~VulkanMap()
+// Stores a copy of `value` under `key`. std::map::insert leaves an existing
+// entry for the same key untouched.
+template <class VulkanKey, class VulkanValue>
+void VulkanMap<VulkanKey, VulkanValue>::insert(const VulkanKey &key,
+                                               VulkanValue &value)
+{
+    typedef std::pair<VulkanKey, std::reference_wrapper<VulkanValue>> Entry;
+    const std::reference_wrapper<VulkanValue> valueRef(value);
+    m_map.insert(Entry(key, valueRef));
+}
+// Read-only lookup of the value stored under `key`.
+// Throws std::out_of_range (from std::map::at) when the key is absent.
+// The previous body was a bare `return;`, which is ill-formed in a function
+// returning a reference.
+template <class VulkanKey, class VulkanValue>
+const VulkanValue &
+    VulkanMap<VulkanKey, VulkanValue>::operator[](const VulkanKey &key) const
+{
+    return m_map.at(key);
+}
+// Mutable lookup of the value stored under `key`.
+// std::map::at is used instead of std::map::operator[] so the value type does
+// not have to be default-constructible (std::reference_wrapper is not); it
+// throws std::out_of_range when the key is absent.
+// The previous body was a bare `return;`, which is ill-formed in a function
+// returning a reference.
+template <class VulkanKey, class VulkanValue>
+VulkanValue &VulkanMap<VulkanKey, VulkanValue>::operator[](const VulkanKey &key)
+{
+    return m_map.at(key);
+}
+#endif // _vulkan_list_map_hpp_
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.cpp
new file mode 100644
index 0000000..1a313cc
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.cpp
@@ -0,0 +1,692 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "vulkan_utility.hpp"
+#include "vulkan_wrapper.hpp"
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <set>
+#include <string>
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#if defined(_WIN32) || defined(_WIN64)
+#include <versionhelpers.h>
+#define ASSERT(x) assert((x))
+#define BUFFERSIZE 3000
+// Returns the process-wide VulkanInstance. The instance is a function-local
+// static, so it is constructed on the first call and the same object is
+// returned by every later call.
+const VulkanInstance &getVulkanInstance()
+    static VulkanInstance instance;
+    return instance;
+// Selects the Vulkan physical device that corresponds to an OpenCL GPU.
+// The GPU devices of the first OpenCL platform are visited in order; the
+// first one whose CL_DEVICE_UUID_KHR equals the UUID of an enumerated Vulkan
+// physical device determines the result.
+// Returns a reference into the instance's physical device list.
+// Throws std::runtime_error when an OpenCL query fails or nothing matches.
+const VulkanPhysicalDevice &getVulkanPhysicalDevice()
+{
+    cl_int errNum = 0;
+    cl_platform_id platform = NULL;
+    cl_uchar uuid[CL_UUID_SIZE_KHR];
+    cl_uint num_devices = 0;
+    const VulkanInstance &instance = getVulkanInstance();
+    const VulkanPhysicalDeviceList &physicalDeviceList =
+        instance.getPhysicalDeviceList();
+    // Starts at "not found" so the final check is well defined even if the
+    // device loop below never runs (it used to read an uninitialised index).
+    size_t pdIdx = physicalDeviceList.size();
+    // get the platform ID
+    errNum = clGetPlatformIDs(1, &platform, NULL);
+    if (errNum != CL_SUCCESS)
+    {
+        printf("Error: Failed to get platform\n");
+        throw std::runtime_error("Error: Failed to get number of platform\n");
+    }
+    errNum =
+        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
+    if (CL_SUCCESS != errNum)
+    {
+        throw std::runtime_error(
+            "Error: clGetDeviceIDs failed in returning of devices\n");
+    }
+    if (num_devices == 0)
+    {
+        throw std::runtime_error("failed to find a suitable GPU!");
+    }
+    // std::vector owns the storage, so it is released on every exit path,
+    // including the throws below (the malloc'ed array was never freed).
+    std::vector<cl_device_id> devices(num_devices);
+    errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices,
+                            devices.data(), NULL);
+    if (CL_SUCCESS != errNum)
+    {
+        throw std::runtime_error("Error: Failed to get deviceID.\n");
+    }
+    bool is_selected = false;
+    for (cl_uint device_no = 0; device_no < num_devices; device_no++)
+    {
+        size_t extensionSize = 0;
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0,
+                                 NULL, &extensionSize);
+        if (CL_SUCCESS != errNum)
+        {
+            throw std::runtime_error("Error in clGetDeviceInfo for getting "
+                                     "device_extension size....\n");
+        }
+        // NOTE(review): the extension string is queried but never inspected.
+        // The query is kept so its failure modes are unchanged; confirm
+        // whether a check for a specific extension was intended here.
+        std::vector<char> extensions(extensionSize);
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS,
+                                 extensionSize, extensions.data(), NULL);
+        if (CL_SUCCESS != errNum)
+        {
+            throw std::runtime_error("Error: Error in clGetDeviceInfo for "
+                                     "getting device_extension\n");
+        }
+        errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR,
+                                 CL_UUID_SIZE_KHR, uuid, NULL);
+        if (CL_SUCCESS != errNum)
+        {
+            throw std::runtime_error(
+                "Error: clGetDeviceInfo failed with error\n");
+        }
+        // Look for the Vulkan physical device carrying the same UUID.
+        for (pdIdx = 0; pdIdx < physicalDeviceList.size(); pdIdx++)
+        {
+            if (!memcmp(uuid, physicalDeviceList[pdIdx].getUUID(),
+                        VK_UUID_SIZE))
+            {
+                std::cout << "Selected physical device = "
+                          << physicalDeviceList[pdIdx] << std::endl;
+                is_selected = true;
+                break;
+            }
+        }
+        if (is_selected)
+        {
+            break;
+        }
+    }
+    // pdIdx equals the list size when no OpenCL device matched.
+    if ((pdIdx >= physicalDeviceList.size())
+        || (physicalDeviceList[pdIdx] == (VkPhysicalDevice)VK_NULL_HANDLE))
+    {
+        throw std::runtime_error("failed to find a suitable GPU!");
+    }
+    std::cout << "Selected physical device is: " << physicalDeviceList[pdIdx]
+              << std::endl;
+    return physicalDeviceList[pdIdx];
+}
+// Returns the first queue family of the selected physical device whose queue
+// flags contain every bit set in queueFlags.
+// Throws std::runtime_error when no family qualifies; previously the list was
+// then indexed one past its last element.
+const VulkanQueueFamily &getVulkanQueueFamily(uint32_t queueFlags)
+{
+    const VulkanPhysicalDevice &physicalDevice = getVulkanPhysicalDevice();
+    const VulkanQueueFamilyList &queueFamilyList =
+        physicalDevice.getQueueFamilyList();
+    for (size_t qfIdx = 0; qfIdx < queueFamilyList.size(); qfIdx++)
+    {
+        if ((queueFamilyList[qfIdx].getQueueFlags() & queueFlags) == queueFlags)
+        {
+            return queueFamilyList[qfIdx];
+        }
+    }
+    throw std::runtime_error(
+        "failed to find a queue family with the requested queue flags");
+}
+// Returns the first memory type of the device's physical device whose
+// property bits contain every bit set in memoryTypeProperty.
+// Throws std::runtime_error when no memory type qualifies; previously the
+// list was then indexed one past its last element (the bounds check was
+// commented out).
+const VulkanMemoryType &
+getVulkanMemoryType(const VulkanDevice &device,
+                    VulkanMemoryTypeProperty memoryTypeProperty)
+{
+    const VulkanMemoryTypeList &memoryTypeList =
+        device.getPhysicalDevice().getMemoryTypeList();
+    for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+    {
+        if ((memoryTypeList[mtIdx].getMemoryTypeProperty() & memoryTypeProperty)
+            == memoryTypeProperty)
+        {
+            return memoryTypeList[mtIdx];
+        }
+    }
+    throw std::runtime_error(
+        "failed to find a memory type with the requested properties");
+}
+// Reports whether the Vulkan instance exposes any physical device, judged by
+// the physical device list's native-handle pointer being non-NULL.
+// Prints a diagnostic and returns false otherwise.
+bool checkVkSupport()
+{
+    const VulkanPhysicalDeviceList &devices =
+        getVulkanInstance().getPhysicalDeviceList();
+    if (devices() != NULL)
+    {
+        return true;
+    }
+    std::cout << "physicalDeviceList is null, No GPUs found with "
+                 "Vulkan support !!!\n";
+    return false;
+}
+// Returns a reference to a shared, default-constructed VulkanQueueFamilyList.
+// The object is a function-local static, so every call yields the same one.
+const VulkanQueueFamilyList &getEmptyVulkanQueueFamilyList()
+    static VulkanQueueFamilyList queueFamilyList;
+    return queueFamilyList;
+// Returns a reference to a shared, default-constructed
+// VulkanDescriptorSetLayoutList. The object is a function-local static, so
+// every call yields the same one.
+const VulkanDescriptorSetLayoutList &getEmptyVulkanDescriptorSetLayoutList()
+    static VulkanDescriptorSetLayoutList descriptorSetLayoutList;
+    return descriptorSetLayoutList;
+const VulkanQueueFamilyToQueueCountMap &
+    static VulkanQueueFamilyToQueueCountMap queueFamilyToQueueCountMap(1);
+    return queueFamilyToQueueCountMap;
+const std::vector<VulkanExternalMemoryHandleType>
+    std::vector<VulkanExternalMemoryHandleType> externalMemoryHandleTypeList;
+#if _WIN32
+    if (IsWindows8OrGreater())
+    {
+        externalMemoryHandleTypeList.push_back(
+    }
+    externalMemoryHandleTypeList.push_back(
+    externalMemoryHandleTypeList.push_back(
+    return externalMemoryHandleTypeList;
+const std::vector<VulkanExternalSemaphoreHandleType>
+    std::vector<VulkanExternalSemaphoreHandleType>
+        externalSemaphoreHandleTypeList;
+#if _WIN32
+    if (IsWindows8OrGreater())
+    {
+        externalSemaphoreHandleTypeList.push_back(
+    }
+    externalSemaphoreHandleTypeList.push_back(
+    externalSemaphoreHandleTypeList.push_back(
+    return externalSemaphoreHandleTypeList;
+// Returns the fixed list of supported Vulkan formats: the 8-, 16- and 32-bit
+// UINT and SINT formats with one, two or four channels, plus the 32-bit
+// SFLOAT formats with one, two or four channels. The order is unchanged.
+//
+// The former post-construction loop switched over every entry of this same
+// list; each entry is one of the accepted cases, so the rejecting branch
+// could never execute and the loop has been dropped.
+const std::vector<VulkanFormat> getSupportedVulkanFormatList()
+{
+    return std::vector<VulkanFormat>{
+        VULKAN_FORMAT_R8_UINT,
+        VULKAN_FORMAT_R8_SINT,
+        VULKAN_FORMAT_R8G8_UINT,
+        VULKAN_FORMAT_R8G8_SINT,
+        VULKAN_FORMAT_R8G8B8A8_UINT,
+        VULKAN_FORMAT_R8G8B8A8_SINT,
+        VULKAN_FORMAT_R16_UINT,
+        VULKAN_FORMAT_R16_SINT,
+        VULKAN_FORMAT_R16G16_UINT,
+        VULKAN_FORMAT_R16G16_SINT,
+        VULKAN_FORMAT_R16G16B16A16_UINT,
+        VULKAN_FORMAT_R16G16B16A16_SINT,
+        VULKAN_FORMAT_R32_UINT,
+        VULKAN_FORMAT_R32_SINT,
+        VULKAN_FORMAT_R32_SFLOAT,
+        VULKAN_FORMAT_R32G32_UINT,
+        VULKAN_FORMAT_R32G32_SINT,
+        VULKAN_FORMAT_R32G32_SFLOAT,
+        VULKAN_FORMAT_R32G32B32A32_UINT,
+        VULKAN_FORMAT_R32G32B32A32_SINT,
+        VULKAN_FORMAT_R32G32B32A32_SFLOAT,
+    };
+}
+// Returns the size in bytes of one element of `format` for the formats
+// listed below. Any other format trips ASSERT(0), prints "Unknown format"
+// and yields 0.
+uint32_t getVulkanFormatElementSize(VulkanFormat format)
+{
+    uint32_t elementSize = uint32_t(0);
+    switch (format)
+    {
+        case VULKAN_FORMAT_R8_UINT:
+        case VULKAN_FORMAT_R8_SINT: elementSize = uint32_t(1); break;
+        case VULKAN_FORMAT_R8G8_UINT:
+        case VULKAN_FORMAT_R8G8_SINT:
+        case VULKAN_FORMAT_R16_UINT:
+        case VULKAN_FORMAT_R16_SINT: elementSize = uint32_t(2); break;
+        case VULKAN_FORMAT_R8G8B8A8_UINT:
+        case VULKAN_FORMAT_R8G8B8A8_SINT:
+        case VULKAN_FORMAT_R16G16_UINT:
+        case VULKAN_FORMAT_R16G16_SINT:
+        case VULKAN_FORMAT_R32_UINT:
+        case VULKAN_FORMAT_R32_SINT:
+        case VULKAN_FORMAT_R32_SFLOAT: elementSize = uint32_t(4); break;
+        case VULKAN_FORMAT_R16G16B16A16_UINT:
+        case VULKAN_FORMAT_R16G16B16A16_SINT:
+        case VULKAN_FORMAT_R32G32_UINT:
+        case VULKAN_FORMAT_R32G32_SINT:
+        case VULKAN_FORMAT_R32G32_SFLOAT: elementSize = uint32_t(8); break;
+        case VULKAN_FORMAT_R32G32B32A32_UINT:
+        case VULKAN_FORMAT_R32G32B32A32_SINT:
+        case VULKAN_FORMAT_R32G32B32A32_SFLOAT:
+            elementSize = uint32_t(16);
+            break;
+        default:
+            ASSERT(0);
+            std::cout << "Unknown format";
+            break;
+    }
+    return elementSize;
+}
+// Returns the GLSL image format qualifier string for `format` (for example
+// "rgba8ui"). Any format not listed trips ASSERT(0), prints "Unknown format"
+// and yields a null pointer.
+const char *getVulkanFormatGLSLFormat(VulkanFormat format)
+{
+    const char *glslFormat = NULL;
+    switch (format)
+    {
+        case VULKAN_FORMAT_R8_UINT: glslFormat = "r8ui"; break;
+        case VULKAN_FORMAT_R8_SINT: glslFormat = "r8i"; break;
+        case VULKAN_FORMAT_R8G8_UINT: glslFormat = "rg8ui"; break;
+        case VULKAN_FORMAT_R8G8_SINT: glslFormat = "rg8i"; break;
+        case VULKAN_FORMAT_R8G8B8A8_UINT: glslFormat = "rgba8ui"; break;
+        case VULKAN_FORMAT_R8G8B8A8_SINT: glslFormat = "rgba8i"; break;
+        case VULKAN_FORMAT_R16_UINT: glslFormat = "r16ui"; break;
+        case VULKAN_FORMAT_R16_SINT: glslFormat = "r16i"; break;
+        case VULKAN_FORMAT_R16G16_UINT: glslFormat = "rg16ui"; break;
+        case VULKAN_FORMAT_R16G16_SINT: glslFormat = "rg16i"; break;
+        case VULKAN_FORMAT_R16G16B16A16_UINT: glslFormat = "rgba16ui"; break;
+        case VULKAN_FORMAT_R16G16B16A16_SINT: glslFormat = "rgba16i"; break;
+        case VULKAN_FORMAT_R32_UINT: glslFormat = "r32ui"; break;
+        case VULKAN_FORMAT_R32_SINT: glslFormat = "r32i"; break;
+        case VULKAN_FORMAT_R32_SFLOAT: glslFormat = "r32f"; break;
+        case VULKAN_FORMAT_R32G32_UINT: glslFormat = "rg32ui"; break;
+        case VULKAN_FORMAT_R32G32_SINT: glslFormat = "rg32i"; break;
+        case VULKAN_FORMAT_R32G32_SFLOAT: glslFormat = "rg32f"; break;
+        case VULKAN_FORMAT_R32G32B32A32_UINT: glslFormat = "rgba32ui"; break;
+        case VULKAN_FORMAT_R32G32B32A32_SINT: glslFormat = "rgba32i"; break;
+        case VULKAN_FORMAT_R32G32B32A32_SFLOAT: glslFormat = "rgba32f"; break;
+        default:
+            ASSERT(0);
+            std::cout << "Unknown format";
+            break;
+    }
+    return glslFormat;
+}
+std::ostream &operator<<(std::ostream &os,
+                         VulkanMemoryTypeProperty memoryTypeProperty)
+    switch (memoryTypeProperty)
+    {
+        case VULKAN_MEMORY_TYPE_PROPERTY_NONE: return os << "None";
+            return os << "Device local";
+            return os << "Host visible and coherent";
+            return os << "Host visible and cached";
+            return os << "Host visible, cached and coherent";
+            return os << "Device local, Host visible and coherent";
+            return os << "Device local, Host visible and cached";
+            return os << "Device local, Host visible, cached and coherent";
+    }
+    return os;
+std::ostream &
+operator<<(std::ostream &os,
+           VulkanExternalMemoryHandleType externalMemoryHandleType)
+    switch (externalMemoryHandleType)
+    {
+        case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE: return os << "None";
+            return os << "Opaque file descriptor";
+            return os << "Opaque NT handle";
+            return os << "Opaque D3DKMT handle";
+            return os << "Opaque NT and D3DKMT handle";
+    }
+    return os;
+std::ostream &
+operator<<(std::ostream &os,
+           VulkanExternalSemaphoreHandleType externalSemaphoreHandleType)
+    switch (externalSemaphoreHandleType)
+    {
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NONE: return os << "None";
+            return os << "Opaque file descriptor";
+            return os << "Opaque NT handle";
+            return os << "Opaque D3DKMT handle";
+            return os << "Opaque NT and D3DKMT handle";
+    }
+    return os;
+// Streams the symbolic name of a Vulkan format (without the VULKAN_FORMAT_
+// prefix) to `os` and returns `os`. For a format that is not listed below,
+// ASSERT(0) is evaluated, "Unknown format" is written to std::cout (not to
+// `os`) and `os` is returned unchanged.
+std::ostream &operator<<(std::ostream &os, VulkanFormat format)
+{
+    switch (format)
+    {
+        // 8-bit channels.
+        case VULKAN_FORMAT_R8_UINT: return os << "R8_UINT";
+        case VULKAN_FORMAT_R8_SINT: return os << "R8_SINT";
+        case VULKAN_FORMAT_R8G8_UINT: return os << "R8G8_UINT";
+        case VULKAN_FORMAT_R8G8_SINT: return os << "R8G8_SINT";
+        case VULKAN_FORMAT_R8G8B8A8_UINT: return os << "R8G8B8A8_UINT";
+        case VULKAN_FORMAT_R8G8B8A8_SINT: return os << "R8G8B8A8_SINT";
+
+        // 16-bit channels.
+        case VULKAN_FORMAT_R16_UINT: return os << "R16_UINT";
+        case VULKAN_FORMAT_R16_SINT: return os << "R16_SINT";
+        case VULKAN_FORMAT_R16G16_UINT: return os << "R16G16_UINT";
+        case VULKAN_FORMAT_R16G16_SINT: return os << "R16G16_SINT";
+        case VULKAN_FORMAT_R16G16B16A16_UINT: return os << "R16G16B16A16_UINT";
+        case VULKAN_FORMAT_R16G16B16A16_SINT: return os << "R16G16B16A16_SINT";
+
+        // 32-bit channels.
+        case VULKAN_FORMAT_R32_UINT: return os << "R32_UINT";
+        case VULKAN_FORMAT_R32_SINT: return os << "R32_SINT";
+        case VULKAN_FORMAT_R32_SFLOAT: return os << "R32_SFLOAT";
+        case VULKAN_FORMAT_R32G32_UINT: return os << "R32G32_UINT";
+        case VULKAN_FORMAT_R32G32_SINT: return os << "R32G32_SINT";
+        case VULKAN_FORMAT_R32G32_SFLOAT: return os << "R32G32_SFLOAT";
+        case VULKAN_FORMAT_R32G32B32A32_UINT: return os << "R32G32B32A32_UINT";
+        case VULKAN_FORMAT_R32G32B32A32_SINT: return os << "R32G32B32A32_SINT";
+        case VULKAN_FORMAT_R32G32B32A32_SFLOAT:
+            return os << "R32G32B32A32_SFLOAT";
+
+        default:
+            ASSERT(0);
+            std::cout << "Unknown format";
+            break;
+    }
+    return os;
+}
+// Looks for `filename` in a fixed list of directories (relative to the current
+// working directory) and returns the first candidate path that can be opened
+// for reading.
+//
+// Returns a NUL-terminated path allocated with malloc(); the caller owns it and
+// must release it with free(). Returns 0 when the file is not found in any of
+// the directories or when the allocation fails.
+static char *findFilePath(const std::string filename)
+{
+    const char *searchPath[] = {
+        "./", // Same dir
+        "./shaders/", // In shaders folder in same dir
+        "../test_conformance/vulkan/shaders/" // In src folder
+    };
+    const size_t searchPathCount = sizeof(searchPath) / sizeof(searchPath[0]);
+    for (size_t i = 0; i < searchPathCount; ++i)
+    {
+        std::string path(searchPath[i]);
+        path.append(filename);
+
+        // Probe by opening the file; the handle itself is not needed.
+        FILE *fp = fopen(path.c_str(), "rb");
+        if (fp == NULL)
+        {
+            continue;
+        }
+        fclose(fp);
+
+        // File found: hand back a heap copy of the path, including the NUL.
+        char *file_path = (char *)(malloc(path.length() + 1));
+        if (file_path == NULL)
+        {
+            return 0;
+        }
+        strncpy(file_path, path.c_str(), path.length() + 1);
+        return file_path;
+    }
+    // File not found
+    return 0;
+}
+// Reads the whole content of `filename` (located through findFilePath) into a
+// byte vector and prints its size to stdout.
+//
+// Throws std::runtime_error when the file cannot be located, opened or sized.
+std::vector<char> readFile(const std::string &filename)
+{
+    char *file_path = findFilePath(filename);
+    if (file_path == NULL)
+    {
+        // Constructing an ifstream from a null pointer is undefined behavior,
+        // so report the missing file here.
+        throw std::runtime_error("failed to open shader spv file!\n");
+    }
+    // Open at the end (std::ios::ate) so tellg() yields the file size.
+    std::ifstream file(file_path, std::ios::ate | std::ios::binary);
+    // The path string was malloc'ed by findFilePath and is no longer needed.
+    free(file_path);
+    if (!file.is_open())
+    {
+        throw std::runtime_error("failed to open shader spv file!\n");
+    }
+    const std::streamoff endPos = file.tellg();
+    if (endPos < 0)
+    {
+        // tellg() failed; avoid turning -1 into a huge allocation size.
+        throw std::runtime_error("failed to open shader spv file!\n");
+    }
+    size_t fileSize = (size_t)endPos;
+    std::vector<char> buffer(fileSize);
+    file.seekg(0);
+    file.read(buffer.data(), fileSize);
+    file.close();
+    // %zu is the printf conversion for size_t.
+    printf("filesize is %zu", fileSize);
+    return buffer;
+}
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.hpp
new file mode 100644
index 0000000..04f5a59
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.hpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _vulkan_utility_hpp_
+#define _vulkan_utility_hpp_
+#include "vulkan_wrapper_types.hpp"
+#include <vector>
+#include <ostream>
+#include <string.h>
+#include <map>
+#include "../../../test_common/harness/testHarness.h"
+#define STRING_(str) #str
+#define STRING(str) STRING_(str)
+#define ROUND_UP(n, multiple)                                                  \
+    (((n) + (multiple)-1) - ((((n) + (multiple)-1)) % (multiple)))
+const VulkanInstance& getVulkanInstance();
+const VulkanPhysicalDevice& getVulkanPhysicalDevice();
+const VulkanQueueFamily&
+getVulkanQueueFamily(uint32_t queueFlags = VULKAN_QUEUE_FLAG_MASK_ALL);
+const VulkanMemoryType&
+getVulkanMemoryType(const VulkanDevice& device,
+                    VulkanMemoryTypeProperty memoryTypeProperty);
+bool checkVkSupport();
+const VulkanQueueFamilyList& getEmptyVulkanQueueFamilyList();
+const VulkanDescriptorSetLayoutList& getEmptyVulkanDescriptorSetLayoutList();
+const VulkanQueueFamilyToQueueCountMap&
+const std::vector<VulkanExternalMemoryHandleType>
+const std::vector<VulkanExternalSemaphoreHandleType>
+const std::vector<VulkanFormat> getSupportedVulkanFormatList();
+uint32_t getVulkanFormatElementSize(VulkanFormat format);
+const char* getVulkanFormatGLSLFormat(VulkanFormat format);
+const char* getVulkanFormatGLSLTypePrefix(VulkanFormat format);
+std::string prepareVulkanShader(
+    std::string shaderCode,
+    const std::map<std::string, std::string>& patternToSubstituteMap);
+std::ostream& operator<<(std::ostream& os,
+                         VulkanMemoryTypeProperty memoryTypeProperty);
+operator<<(std::ostream& os,
+           VulkanExternalMemoryHandleType externalMemoryHandleType);
+operator<<(std::ostream& os,
+           VulkanExternalSemaphoreHandleType externalSemaphoreHandleType);
+std::ostream& operator<<(std::ostream& os, VulkanFormat format);
+std::vector<char> readFile(const std::string& filename);
+#endif // _vulkan_utility_hpp_
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.cpp
new file mode 100644
index 0000000..6209a74
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.cpp
@@ -0,0 +1,2072 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifdef _WIN32
+#define NOMINMAX
+#include <Windows.h>
+#include <dxgi1_2.h>
+#include <aclapi.h>
+#include <vulkan/vulkan.h>
+#include "vulkan_wrapper.hpp"
+#if defined(__linux__) && !defined(__ANDROID__)
+#include <gnu/libc-version.h>
+#include <dlfcn.h>
+#elif defined(__ANDROID__)
+#include <dlfcn.h>
+#if defined _WIN32
+#define LoadFunction GetProcAddress
+#elif defined __linux
+#define LoadFunction dlsym
+extern "C" {
+#define VK_FUNC_DECL(name) PFN_##name _##name = NULL;
+#if defined(_WIN32) || defined(_WIN64)
+#undef VK_FUNC_DECL
+#define WAIVED 2
+#define HANDLE_ERROR -1
+#define CHECK_VK(call)                                                         \
+    if (call != VK_SUCCESS) return call;
+// VulkanInstance implementation //
+VulkanInstance::VulkanInstance(const VulkanInstance &instance)
+    : m_vkInstance(instance.m_vkInstance),
+      m_physicalDeviceList(instance.m_physicalDeviceList)
+VulkanInstance::VulkanInstance(): m_vkInstance(VK_NULL_HANDLE)
+#if defined(__linux__) && !defined(__ANDROID__)
+    char *glibcVersion = strdup(gnu_get_libc_version());
+    int majNum = (int)atoi(strtok(glibcVersion, "."));
+    int minNum = (int)atoi(strtok(NULL, "."));
+    free(glibcVersion);
+    if ((majNum < 2) || (majNum == 2 && minNum < 17))
+    {
+        // WAIVE_TEST() << "Insufficient GLIBC version. Test waived!";
+    }
+#if defined(_WIN32) || defined(_WIN64)
+    const char *vulkanLoaderLibraryName = "vulkan-1.dll";
+#elif defined(__linux__)
+    const char *vulkanLoaderLibraryName = "";
+#ifdef _WIN32
+    hDLL = LoadLibrary(vulkanLoaderLibraryName);
+    if (hDLL == NULL)
+    {
+        throw std::runtime_error("LoadLibrary failed!");
+    }
+    vkGetInstanceProcAddr =
+        (PFN_vkGetInstanceProcAddr)LoadFunction(hDLL, "vkGetInstanceProcAddr");
+#if !defined(__APPLE__)
+    void *handle;
+    handle = dlopen(vulkanLoaderLibraryName, RTLD_LAZY);
+    if (!handle)
+    {
+        fputs(dlerror(), stderr);
+        throw std::runtime_error("dlopen failed !!!");
+    }
+    vkGetInstanceProcAddr = (PFN_vkGetInstanceProcAddr)LoadFunction(
+        handle, "vkGetInstanceProcAddr");
+    if ((unsigned long long)vkGetInstanceProcAddr == (unsigned long long)NULL)
+    {
+        throw std::runtime_error("vkGetInstanceProcAddr() not found!");
+    }
+#define VK_GET_NULL_INSTANCE_PROC_ADDR(name)                                   \
+    _##name = (PFN_##name)vkGetInstanceProcAddr(NULL, #name);
+    if ((unsigned long long)vkGetInstanceProcAddr == (unsigned long long)NULL)
+    {
+        throw std::runtime_error("Couldn't obtain address for function");
+    }
+    VK_GET_NULL_INSTANCE_PROC_ADDR(vkEnumerateInstanceExtensionProperties);
+    uint32_t instanceExtensionPropertiesCount;
+    VkResult vkStatus = VK_SUCCESS;
+    vkStatus = vkEnumerateInstanceExtensionProperties(
+        NULL, &instanceExtensionPropertiesCount, NULL);
+    // Something went wrong in vulkan initialization (most likely incompatible
+    // device/driver combination)
+    {
+        throw std::runtime_error(
+            "Waiving vulkan test because "
+            "vkEnumerateInstanceExtensionProperties failed.");
+        // return WAIVED;
+    }
+    VK_GET_NULL_INSTANCE_PROC_ADDR(vkEnumerateInstanceVersion);
+    VK_GET_NULL_INSTANCE_PROC_ADDR(vkEnumerateInstanceLayerProperties);
+    VK_GET_NULL_INSTANCE_PROC_ADDR(vkCreateInstance);
+    VkApplicationInfo vkApplicationInfo = {};
+    vkApplicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+    vkApplicationInfo.pNext = NULL;
+    vkApplicationInfo.pApplicationName = "Default app";
+    vkApplicationInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
+    vkApplicationInfo.pEngineName = "No engine";
+    vkApplicationInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
+    vkApplicationInfo.apiVersion = VK_API_VERSION_1_0;
+    std::vector<const char *> enabledExtensionNameList;
+    enabledExtensionNameList.push_back(
+    enabledExtensionNameList.push_back(
+    enabledExtensionNameList.push_back(
+    std::vector<VkExtensionProperties> vkExtensionPropertiesList(
+        instanceExtensionPropertiesCount);
+    vkEnumerateInstanceExtensionProperties(NULL,
+                                           &instanceExtensionPropertiesCount,
+                                 ;
+    for (size_t eenIdx = 0; eenIdx < enabledExtensionNameList.size(); eenIdx++)
+    {
+        bool isSupported = false;
+        for (size_t epIdx = 0; epIdx < vkExtensionPropertiesList.size();
+             epIdx++)
+        {
+            if (!strcmp(enabledExtensionNameList[eenIdx],
+                        vkExtensionPropertiesList[epIdx].extensionName))
+            {
+                isSupported = true;
+                break;
+            }
+        }
+        if (!isSupported)
+        {
+            return;
+        }
+    }
+    VkInstanceCreateInfo vkInstanceCreateInfo = {};
+    vkInstanceCreateInfo.pNext = NULL;
+    vkInstanceCreateInfo.flags = 0;
+    vkInstanceCreateInfo.pApplicationInfo = &vkApplicationInfo;
+    vkInstanceCreateInfo.enabledLayerCount = 0;
+    vkInstanceCreateInfo.ppEnabledLayerNames = NULL;
+    vkInstanceCreateInfo.enabledExtensionCount =
+        (uint32_t)enabledExtensionNameList.size();
+    vkInstanceCreateInfo.ppEnabledExtensionNames =
+    vkCreateInstance(&vkInstanceCreateInfo, NULL, &m_vkInstance);
+#define VK_FUNC_DECL(name)                                                     \
+    _##name = (PFN_##name)vkGetInstanceProcAddr(m_vkInstance, #name);          \
+    // ASSERT_NEQ((unsigned long long)name, 0ULL) << "Couldn't obtain address
+    // for function" << #name;
+#if defined(_WIN32) || defined(_WIN64)
+#undef VK_FUNC_DECL
+    uint32_t physicalDeviceCount = 0;
+    vkEnumeratePhysicalDevices(m_vkInstance, &physicalDeviceCount, NULL);
+    // CHECK_NEQ(physicalDeviceCount, uint32_t(0));
+    if (physicalDeviceCount == uint32_t(0))
+    {
+        std::cout << "failed to find GPUs with Vulkan support!\n";
+        return;
+    }
+    std::vector<VkPhysicalDevice> vkPhysicalDeviceList(physicalDeviceCount,
+                                                       VK_NULL_HANDLE);
+    vkEnumeratePhysicalDevices(m_vkInstance, &physicalDeviceCount,
+                     ;
+    for (size_t ppdIdx = 0; ppdIdx < vkPhysicalDeviceList.size(); ppdIdx++)
+    {
+        VulkanPhysicalDevice *physicalDevice =
+            new VulkanPhysicalDevice(vkPhysicalDeviceList[ppdIdx]);
+        m_physicalDeviceList.add(*physicalDevice);
+    }
+    for (size_t pdIdx = 0; pdIdx < m_physicalDeviceList.size(); pdIdx++)
+    {
+        const VulkanPhysicalDevice &physicalDevice =
+            m_physicalDeviceList[pdIdx];
+        delete &physicalDevice;
+    }
+    if (m_vkInstance)
+    {
+        vkDestroyInstance(m_vkInstance, NULL);
+    }
+const VulkanPhysicalDeviceList &VulkanInstance::getPhysicalDeviceList() const
+    return m_physicalDeviceList;
+VulkanInstance::operator VkInstance() const { return m_vkInstance; }
+// VulkanPhysicalDevice implementation //
+    const VulkanPhysicalDevice &physicalDevice)
+    : m_vkPhysicalDevice(physicalDevice.m_vkPhysicalDevice),
+      m_vkPhysicalDeviceProperties(physicalDevice.m_vkPhysicalDeviceProperties),
+      m_vkDeviceNodeMask(physicalDevice.m_vkDeviceNodeMask),
+      m_vkPhysicalDeviceFeatures(physicalDevice.m_vkPhysicalDeviceFeatures),
+      m_vkPhysicalDeviceMemoryProperties(
+          physicalDevice.m_vkPhysicalDeviceMemoryProperties),
+      m_queueFamilyList(physicalDevice.m_queueFamilyList)
+    memcpy(m_vkDeviceUUID, physicalDevice.m_vkDeviceUUID, VK_UUID_SIZE);
+VulkanPhysicalDevice::VulkanPhysicalDevice(VkPhysicalDevice vkPhysicalDevice)
+    : m_vkPhysicalDevice(vkPhysicalDevice)
+    if (m_vkPhysicalDevice == (VkPhysicalDevice)VK_NULL_HANDLE)
+    {
+        throw std::runtime_error("failed to find a suitable GPU!");
+    }
+    vkGetPhysicalDeviceProperties(m_vkPhysicalDevice,
+                                  &m_vkPhysicalDeviceProperties);
+    vkGetPhysicalDeviceFeatures(m_vkPhysicalDevice,
+                                &m_vkPhysicalDeviceFeatures);
+    VkPhysicalDeviceIDPropertiesKHR vkPhysicalDeviceIDPropertiesKHR = {};
+    vkPhysicalDeviceIDPropertiesKHR.sType =
+    vkPhysicalDeviceIDPropertiesKHR.pNext = NULL;
+    VkPhysicalDeviceProperties2KHR vkPhysicalDeviceProperties2KHR = {};
+    vkPhysicalDeviceProperties2KHR.sType =
+    vkPhysicalDeviceProperties2KHR.pNext = &vkPhysicalDeviceIDPropertiesKHR;
+    vkGetPhysicalDeviceProperties2KHR(m_vkPhysicalDevice,
+                                      &vkPhysicalDeviceProperties2KHR);
+    memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDPropertiesKHR.deviceUUID,
+           sizeof(m_vkDeviceUUID));
+    memcpy(m_vkDeviceLUID, vkPhysicalDeviceIDPropertiesKHR.deviceLUID,
+           sizeof(m_vkDeviceLUID));
+    m_vkDeviceNodeMask = vkPhysicalDeviceIDPropertiesKHR.deviceNodeMask;
+    uint32_t queueFamilyCount = 0;
+    vkGetPhysicalDeviceQueueFamilyProperties(m_vkPhysicalDevice,
+                                             &queueFamilyCount, NULL);
+    std::vector<VkQueueFamilyProperties> vkQueueFamilyPropertiesList(
+        queueFamilyCount);
+    vkGetPhysicalDeviceQueueFamilyProperties(
+        m_vkPhysicalDevice, &queueFamilyCount,
+    for (size_t qfpIdx = 0; qfpIdx < vkQueueFamilyPropertiesList.size();
+         qfpIdx++)
+    {
+        VulkanQueueFamily *queueFamily = new VulkanQueueFamily(
+            uint32_t(qfpIdx), vkQueueFamilyPropertiesList[qfpIdx]);
+        m_queueFamilyList.add(*queueFamily);
+    }
+    vkGetPhysicalDeviceMemoryProperties(m_vkPhysicalDevice,
+                                        &m_vkPhysicalDeviceMemoryProperties);
+    for (uint32_t mhIdx = 0;
+         mhIdx < m_vkPhysicalDeviceMemoryProperties.memoryHeapCount; mhIdx++)
+    {
+        VulkanMemoryHeap *memoryHeap = new VulkanMemoryHeap(
+            mhIdx, m_vkPhysicalDeviceMemoryProperties.memoryHeaps[mhIdx].size,
+            (VulkanMemoryHeapFlag)m_vkPhysicalDeviceMemoryProperties
+                .memoryHeaps[mhIdx]
+                .flags);
+        m_memoryHeapList.add(*memoryHeap);
+    }
+    for (uint32_t mtIdx = 0;
+         mtIdx < m_vkPhysicalDeviceMemoryProperties.memoryTypeCount; mtIdx++)
+    {
+        const VulkanMemoryHeap &memoryHeap = m_memoryHeapList
+            [m_vkPhysicalDeviceMemoryProperties.memoryTypes[mtIdx].heapIndex];
+        VulkanMemoryType *memoryType = new VulkanMemoryType(
+            mtIdx,
+            (VulkanMemoryTypeProperty)m_vkPhysicalDeviceMemoryProperties
+                .memoryTypes[mtIdx]
+                .propertyFlags,
+            memoryHeap);
+        m_memoryTypeList.add(*memoryType);
+    }
+    for (size_t mtIdx = 0; mtIdx < m_memoryTypeList.size(); mtIdx++)
+    {
+        const VulkanMemoryType &memoryType = m_memoryTypeList[mtIdx];
+        delete &memoryType;
+    }
+    for (size_t mhIdx = 0; mhIdx < m_memoryHeapList.size(); mhIdx++)
+    {
+        const VulkanMemoryHeap &memoryHeap = m_memoryHeapList[mhIdx];
+        delete &memoryHeap;
+    }
+    for (size_t qfIdx = 0; qfIdx < m_queueFamilyList.size(); qfIdx++)
+    {
+        const VulkanQueueFamily &queueFamily = m_queueFamilyList[qfIdx];
+        delete &queueFamily;
+    }
+// Returns the list of queue families held by this physical device.
+const VulkanQueueFamilyList &VulkanPhysicalDevice::getQueueFamilyList() const
+{
+    return m_queueFamilyList;
+}
+
+// Returns the list of memory heaps held by this physical device.
+const VulkanMemoryHeapList &VulkanPhysicalDevice::getMemoryHeapList() const
+{
+    return m_memoryHeapList;
+}
+
+// Returns the list of memory types held by this physical device.
+const VulkanMemoryTypeList &VulkanPhysicalDevice::getMemoryTypeList() const
+{
+    return m_memoryTypeList;
+}
+
+// Returns a pointer to the stored device UUID bytes.
+const uint8_t *VulkanPhysicalDevice::getUUID() const { return m_vkDeviceUUID; }
+
+// Returns a pointer to the stored device LUID bytes.
+const uint8_t *VulkanPhysicalDevice::getLUID() const { return m_vkDeviceLUID; }
+
+// Returns the stored device node mask.
+uint32_t VulkanPhysicalDevice::getNodeMask() const
+{
+    return m_vkDeviceNodeMask;
+}
+
+// Implicit conversion to the wrapped VkPhysicalDevice handle.
+VulkanPhysicalDevice::operator VkPhysicalDevice() const
+{
+    return m_vkPhysicalDevice;
+}
+
+// Orders queue families by the value of their uint32_t conversion.
+bool operator<(const VulkanQueueFamily &queueFamilyA,
+               const VulkanQueueFamily &queueFamilyB)
+{
+    return (uint32_t)queueFamilyA < (uint32_t)queueFamilyB;
+}
+// VulkanMemoryHeap implementation //
+// Copy constructor: copies the heap index, size and flag of `memoryHeap`.
+VulkanMemoryHeap::VulkanMemoryHeap(const VulkanMemoryHeap &memoryHeap)
+    : m_memoryHeapIndex(memoryHeap.m_memoryHeapIndex),
+      m_size(memoryHeap.m_size), m_memoryHeapFlag(memoryHeap.m_memoryHeapFlag)
+{}
+
+// Builds a heap description from its index, size and flag.
+VulkanMemoryHeap::VulkanMemoryHeap(uint32_t memoryHeapIndex, uint64_t size,
+                                   VulkanMemoryHeapFlag memoryHeapFlag)
+    : m_memoryHeapIndex(memoryHeapIndex), m_size(size),
+      m_memoryHeapFlag(memoryHeapFlag)
+{}
+
+VulkanMemoryHeap::~VulkanMemoryHeap() {}
+
+// Returns the stored heap size.
+uint64_t VulkanMemoryHeap::getSize() const { return m_size; }
+
+// Returns the stored heap flag.
+VulkanMemoryHeapFlag VulkanMemoryHeap::getMemoryHeapFlag() const
+{
+    return m_memoryHeapFlag;
+}
+
+// Implicit conversion to the heap index.
+VulkanMemoryHeap::operator uint32_t() const { return m_memoryHeapIndex; }
+// VulkanMemoryType implementation //
+// Copy constructor: copies the type index, property and heap reference of
+// `memoryType`.
+VulkanMemoryType::VulkanMemoryType(const VulkanMemoryType &memoryType)
+    : m_memoryTypeIndex(memoryType.m_memoryTypeIndex),
+      m_memoryTypeProperty(memoryType.m_memoryTypeProperty),
+      m_memoryHeap(memoryType.m_memoryHeap)
+{}
+
+// Builds a memory type description from its index, property and the heap it
+// is associated with.
+VulkanMemoryType::VulkanMemoryType(uint32_t memoryTypeIndex,
+                                   VulkanMemoryTypeProperty memoryTypeProperty,
+                                   const VulkanMemoryHeap &memoryHeap)
+    : m_memoryTypeIndex(memoryTypeIndex),
+      m_memoryTypeProperty(memoryTypeProperty), m_memoryHeap(memoryHeap)
+{}
+
+VulkanMemoryType::~VulkanMemoryType() {}
+
+// Returns the stored memory type property.
+VulkanMemoryTypeProperty VulkanMemoryType::getMemoryTypeProperty() const
+{
+    return m_memoryTypeProperty;
+}
+
+// Returns the heap this memory type was constructed with.
+const VulkanMemoryHeap &VulkanMemoryType::getMemoryHeap() const
+{
+    return m_memoryHeap;
+}
+
+// Implicit conversion to the memory type index.
+VulkanMemoryType::operator uint32_t() const { return m_memoryTypeIndex; }
+// VulkanQueueFamily implementation //
+VulkanQueueFamily::VulkanQueueFamily(const VulkanQueueFamily &queueFamily)
+    : m_queueFamilyIndex(queueFamily.m_queueFamilyIndex),
+      m_vkQueueFamilyProperties(queueFamily.m_vkQueueFamilyProperties)
+    uint32_t queueFamilyIndex, VkQueueFamilyProperties vkQueueFamilyProperties)
+    : m_queueFamilyIndex(queueFamilyIndex),
+      m_vkQueueFamilyProperties(vkQueueFamilyProperties)
+VulkanQueueFamily::~VulkanQueueFamily() {}
+uint32_t VulkanQueueFamily::getQueueFlags() const
+    return m_vkQueueFamilyProperties.queueFlags
+        & (uint32_t)VULKAN_QUEUE_FLAG_MASK_ALL;
+uint32_t VulkanQueueFamily::getQueueCount() const
+    return m_vkQueueFamilyProperties.queueCount;
+VulkanQueueFamily::operator uint32_t() const { return m_queueFamilyIndex; }
+// VulkanDevice implementation //
+VulkanDevice::VulkanDevice(const VulkanDevice &device)
+    : m_physicalDevice(device.m_physicalDevice), m_vkDevice(device.m_vkDevice)
+    const VulkanPhysicalDevice &physicalDevice,
+    const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap)
+    : m_physicalDevice(physicalDevice), m_vkDevice(NULL)
+    uint32_t maxQueueCount = 0;
+    for (uint32_t qfIdx = 0;
+         qfIdx < (uint32_t)physicalDevice.getQueueFamilyList().size(); qfIdx++)
+    {
+        maxQueueCount =
+            std::max(maxQueueCount, queueFamilyToQueueCountMap[qfIdx]);
+    }
+    std::vector<VkDeviceQueueCreateInfo> vkDeviceQueueCreateInfoList;
+    std::vector<float> queuePriorities(maxQueueCount);
+    for (uint32_t qfIdx = 0;
+         qfIdx < (uint32_t)physicalDevice.getQueueFamilyList().size(); qfIdx++)
+    {
+        if (queueFamilyToQueueCountMap[qfIdx])
+        {
+            VkDeviceQueueCreateInfo vkDeviceQueueCreateInfo = {};
+            vkDeviceQueueCreateInfo.sType =
+            vkDeviceQueueCreateInfo.pNext = NULL;
+            vkDeviceQueueCreateInfo.flags = 0;
+            vkDeviceQueueCreateInfo.queueFamilyIndex = qfIdx;
+            vkDeviceQueueCreateInfo.queueCount =
+                queueFamilyToQueueCountMap[qfIdx];
+            vkDeviceQueueCreateInfo.pQueuePriorities =;
+            vkDeviceQueueCreateInfoList.push_back(vkDeviceQueueCreateInfo);
+        }
+    }
+    std::vector<const char *> enabledExtensionNameList;
+    enabledExtensionNameList.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
+    enabledExtensionNameList.push_back(
+#if defined(_WIN32) || defined(_WIN64)
+    enabledExtensionNameList.push_back(
+    enabledExtensionNameList.push_back(
+    enabledExtensionNameList.push_back(
+    enabledExtensionNameList.push_back(
+    VkDeviceCreateInfo vkDeviceCreateInfo = {};
+    vkDeviceCreateInfo.pNext = NULL;
+    vkDeviceCreateInfo.flags = 0;
+    vkDeviceCreateInfo.queueCreateInfoCount =
+        (uint32_t)vkDeviceQueueCreateInfoList.size();
+    vkDeviceCreateInfo.pQueueCreateInfos =;
+    vkDeviceCreateInfo.enabledLayerCount = 0;
+    vkDeviceCreateInfo.ppEnabledLayerNames = NULL;
+    vkDeviceCreateInfo.enabledExtensionCount =
+        (uint32_t)enabledExtensionNameList.size();
+    vkDeviceCreateInfo.ppEnabledExtensionNames =
+    vkDeviceCreateInfo.pEnabledFeatures = NULL;
+    vkCreateDevice(physicalDevice, &vkDeviceCreateInfo, NULL, &m_vkDevice);
+    for (uint32_t qfIdx = 0;
+         qfIdx < (uint32_t)m_physicalDevice.getQueueFamilyList().size();
+         qfIdx++)
+    {
+        VulkanQueueList *queueList = new VulkanQueueList();
+        m_queueFamilyIndexToQueueListMap.insert(qfIdx, *queueList);
+        for (uint32_t qIdx = 0; qIdx < queueFamilyToQueueCountMap[qfIdx];
+             qIdx++)
+        {
+            VkQueue vkQueue;
+            vkGetDeviceQueue(m_vkDevice, qfIdx, qIdx, &vkQueue);
+            VulkanQueue *queue = new VulkanQueue(vkQueue);
+            m_queueFamilyIndexToQueueListMap[qfIdx].add(*queue);
+        }
+    }
+    for (uint32_t qfIdx = 0;
+         qfIdx < (uint32_t)m_physicalDevice.getQueueFamilyList().size();
+         qfIdx++)
+    {
+        for (size_t qIdx = 0;
+             qIdx < m_queueFamilyIndexToQueueListMap[qfIdx].size(); qIdx++)
+        {
+            VulkanQueue &queue = m_queueFamilyIndexToQueueListMap[qfIdx][qIdx];
+            delete &queue;
+        }
+        VulkanQueueList &queueList = m_queueFamilyIndexToQueueListMap[qfIdx];
+        delete &queueList;
+    }
+    vkDestroyDevice(m_vkDevice, NULL);
+const VulkanPhysicalDevice &VulkanDevice::getPhysicalDevice() const
+    return m_physicalDevice;
+VulkanQueue &VulkanDevice::getQueue(const VulkanQueueFamily &queueFamily,
+                                    uint32_t queueIndex)
+    return m_queueFamilyIndexToQueueListMap[queueFamily][queueIndex];
+VulkanDevice::operator VkDevice() const { return m_vkDevice; }
+// VulkanQueue implementation //
+VulkanQueue::VulkanQueue(const VulkanQueue &queue): m_vkQueue(queue.m_vkQueue)
+VulkanQueue::VulkanQueue(VkQueue vkQueue): m_vkQueue(vkQueue) {}
+VulkanQueue::~VulkanQueue() {}
+// Submits one batch to this queue with vkQueueSubmit and no fence.
+//
+// waitSemaphoreList    semaphores the batch waits on; each wait uses the
+//                      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT stage mask.
+// commandBufferList    command buffers of the batch.
+// signalSemaphoreList  semaphores listed as signal semaphores of the batch.
+//
+// The VkResult of vkQueueSubmit is not checked.
+void VulkanQueue::submit(const VulkanSemaphoreList &waitSemaphoreList,
+                         const VulkanCommandBufferList &commandBufferList,
+                         const VulkanSemaphoreList &signalSemaphoreList)
+{
+    // One stage mask per wait semaphore, as pWaitDstStageMask requires an
+    // array of waitSemaphoreCount entries.
+    std::vector<VkPipelineStageFlags> vkPipelineStageFlagsList(
+        waitSemaphoreList.size(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
+
+    VkSubmitInfo vkSubmitInfo = {};
+    vkSubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    vkSubmitInfo.pNext = NULL;
+    vkSubmitInfo.waitSemaphoreCount = (uint32_t)waitSemaphoreList.size();
+    vkSubmitInfo.pWaitSemaphores = waitSemaphoreList();
+    vkSubmitInfo.pWaitDstStageMask = vkPipelineStageFlagsList.data();
+    vkSubmitInfo.commandBufferCount = (uint32_t)commandBufferList.size();
+    vkSubmitInfo.pCommandBuffers = commandBufferList();
+    vkSubmitInfo.signalSemaphoreCount = (uint32_t)signalSemaphoreList.size();
+    vkSubmitInfo.pSignalSemaphores = signalSemaphoreList();
+
+    vkQueueSubmit(m_vkQueue, 1, &vkSubmitInfo, NULL);
+}
+// Convenience overload: submits a single command buffer that waits on
+// `waitSemaphore` and lists `signalSemaphore` as its signal semaphore.
+void VulkanQueue::submit(const VulkanSemaphore &waitSemaphore,
+                         const VulkanCommandBuffer &commandBuffer,
+                         const VulkanSemaphore &signalSemaphore)
+{
+    VulkanSemaphoreList waitSemaphoreList;
+    VulkanCommandBufferList commandBufferList;
+    VulkanSemaphoreList signalSemaphoreList;
+    waitSemaphoreList.add(waitSemaphore);
+    commandBufferList.add(commandBuffer);
+    signalSemaphoreList.add(signalSemaphore);
+    submit(waitSemaphoreList, commandBufferList, signalSemaphoreList);
+}
+
+// Convenience overload: submits a single command buffer with no wait
+// semaphore and `signalSemaphore` as its signal semaphore.
+void VulkanQueue::submit(const VulkanCommandBuffer &commandBuffer,
+                         const VulkanSemaphore &signalSemaphore)
+{
+    VulkanSemaphoreList waitSemaphoreList;
+    VulkanCommandBufferList commandBufferList;
+    VulkanSemaphoreList signalSemaphoreList;
+    commandBufferList.add(commandBuffer);
+    signalSemaphoreList.add(signalSemaphore);
+    submit(waitSemaphoreList, commandBufferList, signalSemaphoreList);
+}
+
+// Convenience overload: submits a single command buffer with no wait and no
+// signal semaphores.
+void VulkanQueue::submit(const VulkanCommandBuffer &commandBuffer)
+{
+    VulkanSemaphoreList waitSemaphoreList;
+    VulkanCommandBufferList commandBufferList;
+    VulkanSemaphoreList signalSemaphoreList;
+    commandBufferList.add(commandBuffer);
+    submit(waitSemaphoreList, commandBufferList, signalSemaphoreList);
+}
+
+// Calls vkQueueWaitIdle on the wrapped queue; the result is not checked.
+void VulkanQueue::waitIdle() { vkQueueWaitIdle(m_vkQueue); }
+
+// Implicit conversion to the wrapped VkQueue handle.
+VulkanQueue::operator VkQueue() const { return m_vkQueue; }
+// VulkanDescriptorSetLayoutBinding implementation //
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding)
+    : m_vkDescriptorSetLayoutBinding(
+        descriptorSetLayoutBinding.m_vkDescriptorSetLayoutBinding)
+    uint32_t binding, VulkanDescriptorType descriptorType,
+    uint32_t descriptorCount, VulkanShaderStage shaderStage)
+    m_vkDescriptorSetLayoutBinding.binding = binding;
+    m_vkDescriptorSetLayoutBinding.descriptorType =
+        (VkDescriptorType)descriptorType;
+    m_vkDescriptorSetLayoutBinding.descriptorCount = descriptorCount;
+    m_vkDescriptorSetLayoutBinding.stageFlags =
+        (VkShaderStageFlags)(VkShaderStageFlagBits)shaderStage;
+    m_vkDescriptorSetLayoutBinding.pImmutableSamplers = NULL;
+VulkanDescriptorSetLayoutBinding::~VulkanDescriptorSetLayoutBinding() {}
+VulkanDescriptorSetLayoutBinding::operator VkDescriptorSetLayoutBinding() const
+    return m_vkDescriptorSetLayoutBinding;
+// VulkanDescriptorSetLayout implementation //
+    const VulkanDescriptorSetLayout &descriptorSetLayout)
+    : m_device(descriptorSetLayout.m_device),
+      m_vkDescriptorSetLayout(descriptorSetLayout.m_vkDescriptorSetLayout)
+// Shared constructor helper: creates the VkDescriptorSetLayout for m_device
+// from the given bindings and stores the handle in m_vkDescriptorSetLayout.
+// The VkResult of vkCreateDescriptorSetLayout is not checked.
+void VulkanDescriptorSetLayout::VulkanDescriptorSetLayoutCommon(
+    const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList)
+{
+    VkDescriptorSetLayoutCreateInfo vkDescriptorSetLayoutCreateInfo = {};
+    // sType must identify the structure being passed to Vulkan.
+    vkDescriptorSetLayoutCreateInfo.sType =
+        VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    vkDescriptorSetLayoutCreateInfo.pNext = NULL;
+    vkDescriptorSetLayoutCreateInfo.flags = 0;
+    vkDescriptorSetLayoutCreateInfo.bindingCount =
+        (uint32_t)descriptorSetLayoutBindingList.size();
+    vkDescriptorSetLayoutCreateInfo.pBindings =
+        descriptorSetLayoutBindingList();
+
+    vkCreateDescriptorSetLayout(m_device, &vkDescriptorSetLayoutCreateInfo,
+                                NULL, &m_vkDescriptorSetLayout);
+}
+    const VulkanDevice &device,
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding)
+    : m_device(device), m_vkDescriptorSetLayout(VK_NULL_HANDLE)
+    VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList;
+    descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding);
+    VulkanDescriptorSetLayoutCommon(descriptorSetLayoutBindingList);
+    const VulkanDevice &device,
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0,
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1)
+    : m_device(device), m_vkDescriptorSetLayout(VK_NULL_HANDLE)
+    VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList;
+    descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding0);
+    descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding1);
+    VulkanDescriptorSetLayoutCommon(descriptorSetLayoutBindingList);
+    const VulkanDevice &device,
+    const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList)
+    : m_device(device), m_vkDescriptorSetLayout(VK_NULL_HANDLE)
+    VulkanDescriptorSetLayoutCommon(descriptorSetLayoutBindingList);
+    if (m_vkDescriptorSetLayout != VK_NULL_HANDLE)
+    {
+        vkDestroyDescriptorSetLayout(m_device, m_vkDescriptorSetLayout, NULL);
+    }
+VulkanDescriptorSetLayout::operator VkDescriptorSetLayout() const
+    return m_vkDescriptorSetLayout;
+// VulkanPipelineLayout implementation //
+    const VulkanPipelineLayout &pipelineLayout) // Copy-constructor parameter (declarator line not in this excerpt).
+    : m_device(pipelineLayout.m_device),
+      m_vkPipelineLayout(pipelineLayout.m_vkPipelineLayout) // Copies the raw handle value; no new Vulkan object is created here.
+void VulkanPipelineLayout::VulkanPipelineLayoutCommon( // Shared creation helper invoked by the constructors below.
+    const VulkanDescriptorSetLayoutList &descriptorSetLayoutList)
+    VkPipelineLayoutCreateInfo vkPipelineLayoutCreateInfo = {};
+    vkPipelineLayoutCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+    vkPipelineLayoutCreateInfo.pNext = NULL;
+    vkPipelineLayoutCreateInfo.flags = 0;
+    vkPipelineLayoutCreateInfo.setLayoutCount =
+        (uint32_t)descriptorSetLayoutList.size();
+    vkPipelineLayoutCreateInfo.pSetLayouts = descriptorSetLayoutList(); // The list's call operator supplies the set-layout pointer.
+    vkPipelineLayoutCreateInfo.pushConstantRangeCount = 0; // No push-constant ranges are declared.
+    vkPipelineLayoutCreateInfo.pPushConstantRanges = NULL;
+    vkCreatePipelineLayout(m_device, &vkPipelineLayoutCreateInfo, NULL,
+                           &m_vkPipelineLayout); // NULL allocator; the call's result is not checked.
+    const VulkanDevice &device, // Single-set-layout constructor parameters (declarator line not in this excerpt).
+    const VulkanDescriptorSetLayout &descriptorSetLayout)
+    : m_device(device), m_vkPipelineLayout(VK_NULL_HANDLE)
+    VulkanDescriptorSetLayoutList descriptorSetLayoutList; // Wrap the one layout in a list so the common helper can be reused.
+    descriptorSetLayoutList.add(descriptorSetLayout);
+    VulkanPipelineLayoutCommon(descriptorSetLayoutList);
+    const VulkanDevice &device, // List constructor parameters (declarator line not in this excerpt).
+    const VulkanDescriptorSetLayoutList &descriptorSetLayoutList)
+    : m_device(device), m_vkPipelineLayout(VK_NULL_HANDLE)
+    VulkanPipelineLayoutCommon(descriptorSetLayoutList);
+    vkDestroyPipelineLayout(m_device, m_vkPipelineLayout, NULL); // Destructor body (declarator not in this excerpt); called without a null-handle guard.
+VulkanPipelineLayout::operator VkPipelineLayout() const // Conversion operator: yields the wrapped handle.
+    return m_vkPipelineLayout;
+// VulkanShaderModule implementation //
+VulkanShaderModule::VulkanShaderModule(const VulkanShaderModule &shaderModule) // Copy constructor: duplicates the device reference and raw handle value.
+    : m_device(shaderModule.m_device),
+      m_vkShaderModule(shaderModule.m_vkShaderModule)
+VulkanShaderModule::VulkanShaderModule(const VulkanDevice &device, // Creates a shader module on `device` from the bytes in `code`.
+                                       const std::vector<char> &code)
+    : m_device(device) // NOTE(review): m_vkShaderModule has no initializer here, unlike the sibling wrappers that start from VK_NULL_HANDLE.
+    VkShaderModuleCreateInfo vkShaderModuleCreateInfo = {};
+    vkShaderModuleCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+    vkShaderModuleCreateInfo.pNext = NULL;
+    vkShaderModuleCreateInfo.flags = 0;
+    vkShaderModuleCreateInfo.codeSize = code.size(); // Size in bytes of the char vector.
+    vkShaderModuleCreateInfo.pCode =
+        reinterpret_cast<const uint32_t *>(; // NOTE(review): the cast operand is truncated in this excerpt.
+    vkCreateShaderModule(m_device, &vkShaderModuleCreateInfo, NULL,
+                         &m_vkShaderModule); // NULL allocator; the call's result is not checked.
+    vkDestroyShaderModule(m_device, m_vkShaderModule, NULL); // Destructor body (declarator not in this excerpt).
+VulkanShaderModule::operator VkShaderModule() const { return m_vkShaderModule; } // Conversion operator: yields the wrapped handle.
+// VulkanPipeline implementation //
+VulkanPipeline::VulkanPipeline(const VulkanPipeline &pipeline) // Copy constructor: duplicates the device reference and raw handle value.
+    : m_device(pipeline.m_device), m_vkPipeline(pipeline.m_vkPipeline)
+VulkanPipeline::VulkanPipeline(const VulkanDevice &device) // Base constructor: starts with a null pipeline handle for a derived class to fill in.
+    : m_device(device), m_vkPipeline(VK_NULL_HANDLE)
+    vkDestroyPipeline(m_device, m_vkPipeline, NULL); // Destructor body (declarator not in this excerpt); called without a null-handle guard.
+VulkanPipeline::operator VkPipeline() const { return m_vkPipeline; } // Conversion operator: yields the wrapped handle.
+// VulkanComputePipeline implementation //
+    const VulkanComputePipeline &computePipeline) // Copy-constructor parameter (declarator line not in this excerpt).
+    : VulkanPipeline(computePipeline) // Delegates to the base-class copy constructor.
+    const VulkanDevice &device, const VulkanPipelineLayout &pipelineLayout, // Creating-constructor parameters (declarator line not in this excerpt).
+    const VulkanShaderModule &shaderModule, const std::string &entryFuncName)
+    : VulkanPipeline(device)
+    VkPipelineShaderStageCreateInfo vkPipelineShaderStageCreateInfo = {};
+    vkPipelineShaderStageCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+    vkPipelineShaderStageCreateInfo.pNext = NULL;
+    vkPipelineShaderStageCreateInfo.flags = 0;
+    vkPipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    vkPipelineShaderStageCreateInfo.module = shaderModule; // Uses the wrapper's conversion to VkShaderModule.
+    vkPipelineShaderStageCreateInfo.pName = entryFuncName.c_str(); // Pointer is only used during the create call below, while entryFuncName is alive.
+    vkPipelineShaderStageCreateInfo.pSpecializationInfo = NULL; // No specialization constants.
+    VkComputePipelineCreateInfo vkComputePipelineCreateInfo = {};
+    vkComputePipelineCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+    vkComputePipelineCreateInfo.pNext = NULL;
+    vkComputePipelineCreateInfo.flags = 0;
+    vkComputePipelineCreateInfo.stage = vkPipelineShaderStageCreateInfo;
+    vkComputePipelineCreateInfo.layout = pipelineLayout; // Uses the wrapper's conversion to VkPipelineLayout.
+    vkComputePipelineCreateInfo.basePipelineHandle = VK_NULL_HANDLE; // No base pipeline is specified.
+    vkComputePipelineCreateInfo.basePipelineIndex = 0;
+    vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, // Creates one pipeline, with no pipeline cache and a NULL allocator.
+                             &vkComputePipelineCreateInfo, NULL, &m_vkPipeline); // The call's result is not checked.
+VulkanComputePipeline::~VulkanComputePipeline() {} // Empty destructor body.
+VulkanPipelineBindPoint VulkanComputePipeline::getPipelineBindPoint() const // NOTE(review): the body of this accessor is absent from this excerpt.
+// VulkanDescriptorPool implementation //
+    const VulkanDescriptorPool &descriptorPool) // Copy-constructor parameter (declarator line not in this excerpt).
+    : m_device(descriptorPool.m_device),
+      m_vkDescriptorPool(descriptorPool.m_vkDescriptorPool) // Copies the raw handle value; no new Vulkan object is created here.
+// Shared creation helper used by the constructors below. Builds one
+// VkDescriptorPoolSize per distinct descriptor type found in the binding
+// list and creates a pool with maxSets = 1. When the list is empty no pool
+// is created and m_vkDescriptorPool keeps the value set by the constructor.
+void VulkanDescriptorPool::VulkanDescriptorPoolCommon(
+    const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList)
+    if (descriptorSetLayoutBindingList.size())
+    {
+        std::map<VkDescriptorType, uint32_t>
+            vkDescriptorTypeToDescriptorCountMap;
+        for (size_t dslbIdx = 0;
+             dslbIdx < descriptorSetLayoutBindingList.size(); dslbIdx++)
+        {
+            VkDescriptorSetLayoutBinding vkDescriptorSetLayoutBinding =
+                descriptorSetLayoutBindingList[dslbIdx];
+            // Tally the descriptors each binding declares, not the number
+            // of bindings: a binding with descriptorCount > 1 consumes that
+            // many descriptors of its type from the pool. operator[]
+            // value-initialises a missing entry to 0, so one accumulation
+            // handles both the first and later occurrences of a type.
+            vkDescriptorTypeToDescriptorCountMap
+                [vkDescriptorSetLayoutBinding.descriptorType] +=
+                vkDescriptorSetLayoutBinding.descriptorCount;
+        }
+        std::vector<VkDescriptorPoolSize> vkDescriptorPoolSizeList;
+        std::map<VkDescriptorType, uint32_t>::iterator dtdcIt;
+        for (dtdcIt = vkDescriptorTypeToDescriptorCountMap.begin();
+             dtdcIt != vkDescriptorTypeToDescriptorCountMap.end(); ++dtdcIt)
+        {
+            VkDescriptorPoolSize vkDescriptorPoolSize = {};
+            vkDescriptorPoolSize.type = dtdcIt->first;
+            vkDescriptorPoolSize.descriptorCount = dtdcIt->second;
+            vkDescriptorPoolSizeList.push_back(vkDescriptorPoolSize);
+        }
+        VkDescriptorPoolCreateInfo vkDescriptorPoolCreateInfo = {};
+        vkDescriptorPoolCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+        vkDescriptorPoolCreateInfo.pNext = NULL;
+        vkDescriptorPoolCreateInfo.flags = // NOTE(review): the flags value line is absent from this excerpt.
+        vkDescriptorPoolCreateInfo.maxSets = 1;
+        vkDescriptorPoolCreateInfo.poolSizeCount =
+            (uint32_t)vkDescriptorPoolSizeList.size();
+        vkDescriptorPoolCreateInfo.pPoolSizes =; // NOTE(review): the right-hand side is truncated in this excerpt.
+        vkCreateDescriptorPool(m_device, &vkDescriptorPoolCreateInfo, NULL,
+                               &m_vkDescriptorPool); // NULL allocator; the call's result is not checked.
+    }
+    const VulkanDevice &device, // Single-binding constructor parameters (declarator line not in this excerpt).
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding)
+    : m_device(device), m_vkDescriptorPool(VK_NULL_HANDLE)
+    VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList;
+    descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding);
+    VulkanDescriptorPoolCommon(descriptorSetLayoutBindingList);
+    const VulkanDevice &device, // Two-binding constructor parameters (declarator line not in this excerpt).
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0,
+    const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1)
+    : m_device(device), m_vkDescriptorPool(VK_NULL_HANDLE)
+    VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList;
+    descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding0);
+    descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding1);
+    VulkanDescriptorPoolCommon(descriptorSetLayoutBindingList);
+    const VulkanDevice &device, // List constructor parameters (declarator line not in this excerpt).
+    const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList)
+    : m_device(device), m_vkDescriptorPool(VK_NULL_HANDLE)
+    VulkanDescriptorPoolCommon(descriptorSetLayoutBindingList);
+    if (m_vkDescriptorPool != VK_NULL_HANDLE) // Destructor body (declarator not in this excerpt): destroy only if a pool was created.
+    {
+        vkDestroyDescriptorPool(m_device, m_vkDescriptorPool, NULL);
+    }
+VulkanDescriptorPool::operator VkDescriptorPool() const // Conversion operator: yields the wrapped handle (VK_NULL_HANDLE when no pool was created).
+    return m_vkDescriptorPool;
+// VulkanDescriptorSet implementation //
+    const VulkanDescriptorSet &descriptorSet) // Copy-constructor parameter (declarator line not in this excerpt).
+    : m_device(descriptorSet.m_device),
+      m_descriptorPool(descriptorSet.m_descriptorPool),
+      m_vkDescriptorSet(descriptorSet.m_vkDescriptorSet) // Copies the raw handle value; no new descriptor set is allocated here.
+    const VulkanDevice &device, const VulkanDescriptorPool &descriptorPool, // Allocating-constructor parameters (declarator line not in this excerpt).
+    const VulkanDescriptorSetLayout &descriptorSetLayout)
+    : m_device(device), m_descriptorPool(descriptorPool),
+      m_vkDescriptorSet(VK_NULL_HANDLE)
+    VkDescriptorSetLayout vkDescriptorSetLayout = descriptorSetLayout; // Local copy of the handle so its address can be passed as pSetLayouts.
+    if ((VkDescriptorPool)m_descriptorPool) // Allocate only when the pool wrapper holds a non-null handle.
+    {
+        VkDescriptorSetAllocateInfo vkDescriptorSetAllocateInfo = {};
+        vkDescriptorSetAllocateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+        vkDescriptorSetAllocateInfo.pNext = NULL;
+        vkDescriptorSetAllocateInfo.descriptorPool = descriptorPool;
+        vkDescriptorSetAllocateInfo.descriptorSetCount = 1;
+        vkDescriptorSetAllocateInfo.pSetLayouts = &vkDescriptorSetLayout;
+        vkAllocateDescriptorSets(m_device, &vkDescriptorSetAllocateInfo,
+                                 &m_vkDescriptorSet); // The call's result is not checked.
+    }
+    if ((VkDescriptorPool)m_descriptorPool) // Destructor body (declarator not in this excerpt): same pool guard as the allocation above.
+    {
+        vkFreeDescriptorSets(m_device, m_descriptorPool, 1, &m_vkDescriptorSet);
+    }
+void VulkanDescriptorSet::update(uint32_t binding, const VulkanBuffer &buffer) // Writes `buffer` into `binding` as a storage-buffer descriptor.
+    VkDescriptorBufferInfo vkDescriptorBufferInfo = {};
+    vkDescriptorBufferInfo.buffer = buffer;
+    vkDescriptorBufferInfo.offset = 0;
+    vkDescriptorBufferInfo.range = VK_WHOLE_SIZE; // Binds from offset 0 through the end of the buffer.
+    VkWriteDescriptorSet vkWriteDescriptorSet = {}; // NOTE(review): no sType assignment is visible in this excerpt; confirm against the full file.
+    vkWriteDescriptorSet.pNext = NULL;
+    vkWriteDescriptorSet.dstSet = m_vkDescriptorSet;
+    vkWriteDescriptorSet.dstBinding = binding;
+    vkWriteDescriptorSet.dstArrayElement = 0;
+    vkWriteDescriptorSet.descriptorCount = 1; // Exactly one descriptor, at array element 0, is written.
+    vkWriteDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    vkWriteDescriptorSet.pImageInfo = NULL;
+    vkWriteDescriptorSet.pBufferInfo = &vkDescriptorBufferInfo;
+    vkWriteDescriptorSet.pTexelBufferView = NULL;
+    vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL); // One write, zero copies.
+void VulkanDescriptorSet::update(uint32_t binding, // Writes `imageView` into `binding` as a storage-image descriptor.
+                                 const VulkanImageView &imageView)
+    VkDescriptorImageInfo vkDescriptorImageInfo = {};
+    vkDescriptorImageInfo.sampler = VK_NULL_HANDLE; // No sampler is supplied.
+    vkDescriptorImageInfo.imageView = imageView;
+    vkDescriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL; // The descriptor declares the image as being in GENERAL layout.
+    VkWriteDescriptorSet vkWriteDescriptorSet = {}; // NOTE(review): no sType assignment is visible in this excerpt; confirm against the full file.
+    vkWriteDescriptorSet.pNext = NULL;
+    vkWriteDescriptorSet.dstSet = m_vkDescriptorSet;
+    vkWriteDescriptorSet.dstBinding = binding;
+    vkWriteDescriptorSet.dstArrayElement = 0;
+    vkWriteDescriptorSet.descriptorCount = 1; // Exactly one descriptor, at array element 0, is written.
+    vkWriteDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
+    vkWriteDescriptorSet.pImageInfo = &vkDescriptorImageInfo;
+    vkWriteDescriptorSet.pBufferInfo = NULL;
+    vkWriteDescriptorSet.pTexelBufferView = NULL;
+    vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL); // One write, zero copies.
+VulkanDescriptorSet::operator VkDescriptorSet() const // Conversion operator: yields the wrapped handle.
+    return m_vkDescriptorSet;
+// VulkanOffset3D implementation //
+VulkanOffset3D::VulkanOffset3D(const VulkanOffset3D &offset3D) // Copy constructor: copies the wrapped struct by value.
+    : m_vkOffset3D(offset3D.m_vkOffset3D)
+VulkanOffset3D::VulkanOffset3D(uint32_t x, uint32_t y, uint32_t z) // Stores the three components into the wrapped struct.
+    m_vkOffset3D.x = x;
+    m_vkOffset3D.y = y;
+    m_vkOffset3D.z = z;
+VulkanOffset3D::~VulkanOffset3D() {} // Empty destructor body.
+uint32_t VulkanOffset3D::getX() const { return m_vkOffset3D.x; } // Returns the stored x component.
+uint32_t VulkanOffset3D::getY() const { return m_vkOffset3D.y; } // Returns the stored y component.
+uint32_t VulkanOffset3D::getZ() const { return m_vkOffset3D.z; } // Returns the stored z component.
+VulkanOffset3D::operator VkOffset3D() const { return m_vkOffset3D; } // Conversion operator: returns the wrapped struct by value.
+// VulkanExtent3D implementation //
+VulkanExtent3D::VulkanExtent3D(const VulkanExtent3D &extent3D) // Copy constructor: copies the wrapped struct by value.
+    : m_vkExtent3D(extent3D.m_vkExtent3D)
+VulkanExtent3D::VulkanExtent3D(uint32_t width, uint32_t height, uint32_t depth) // Stores the three dimensions into the wrapped struct.
+    m_vkExtent3D.width = width;
+    m_vkExtent3D.height = height;
+    m_vkExtent3D.depth = depth;
+VulkanExtent3D::~VulkanExtent3D() {} // Empty destructor body.
+uint32_t VulkanExtent3D::getWidth() const { return m_vkExtent3D.width; } // Returns the stored width.
+uint32_t VulkanExtent3D::getHeight() const { return m_vkExtent3D.height; } // Returns the stored height.
+uint32_t VulkanExtent3D::getDepth() const { return m_vkExtent3D.depth; } // Returns the stored depth.
+VulkanExtent3D::operator VkExtent3D() const { return m_vkExtent3D; } // Conversion operator: returns the wrapped struct by value.
+// VulkanCommandPool implementation //
+VulkanCommandPool::VulkanCommandPool(const VulkanCommandPool &commandPool) // Copy constructor: duplicates the device reference and raw handle value.
+    : m_device(commandPool.m_device),
+      m_vkCommandPool(commandPool.m_vkCommandPool)
+VulkanCommandPool::VulkanCommandPool(const VulkanDevice &device, // Creates a command pool on `device` for the given queue family.
+                                     const VulkanQueueFamily &queueFamily)
+    : m_device(device) // NOTE(review): m_vkCommandPool has no initializer here.
+    VkCommandPoolCreateInfo vkCommandPoolCreateInfo = {}; // NOTE(review): no sType assignment is visible in this excerpt; confirm against the full file.
+    vkCommandPoolCreateInfo.pNext = NULL;
+    vkCommandPoolCreateInfo.flags = // NOTE(review): the flags value line is absent from this excerpt.
+    vkCommandPoolCreateInfo.queueFamilyIndex = queueFamily; // Uses the queue-family wrapper's implicit conversion to an index.
+    vkCreateCommandPool(m_device, &vkCommandPoolCreateInfo, NULL,
+                        &m_vkCommandPool); // NULL allocator; the call's result is not checked.
+    vkDestroyCommandPool(m_device, m_vkCommandPool, NULL); // Destructor body (declarator not in this excerpt).
+VulkanCommandPool::operator VkCommandPool() const { return m_vkCommandPool; } // Conversion operator: yields the wrapped handle.
+// VulkanCommandBuffer implementation //
+    const VulkanCommandBuffer &commandBuffer) // Copy-constructor parameter (declarator line not in this excerpt).
+    : m_device(commandBuffer.m_device),
+      m_commandPool(commandBuffer.m_commandPool),
+      m_vkCommandBuffer(commandBuffer.m_vkCommandBuffer) // Copies the raw handle value; no new command buffer is allocated here.
+VulkanCommandBuffer::VulkanCommandBuffer(const VulkanDevice &device, // Allocates one primary command buffer from `commandPool`.
+                                         const VulkanCommandPool &commandPool)
+    : m_device(device), m_commandPool(commandPool)
+    VkCommandBufferAllocateInfo vkCommandBufferAllocateInfo = {};
+    vkCommandBufferAllocateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+    vkCommandBufferAllocateInfo.pNext = NULL;
+    vkCommandBufferAllocateInfo.commandPool = commandPool;
+    vkCommandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+    vkCommandBufferAllocateInfo.commandBufferCount = 1;
+    vkAllocateCommandBuffers(m_device, &vkCommandBufferAllocateInfo,
+                             &m_vkCommandBuffer); // The call's result is not checked.
+    vkFreeCommandBuffers(m_device, m_commandPool, 1, &m_vkCommandBuffer); // Destructor body (declarator not in this excerpt): returns the buffer to its pool.
+void VulkanCommandBuffer::begin() // Starts recording into the wrapped command buffer.
+    VkCommandBufferBeginInfo vkCommandBufferBeginInfo = {};
+    vkCommandBufferBeginInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+    vkCommandBufferBeginInfo.pNext = NULL;
+    vkCommandBufferBeginInfo.flags = // NOTE(review): the flags value line is absent from this excerpt.
+    vkCommandBufferBeginInfo.pInheritanceInfo = NULL; // No inheritance info is supplied.
+    vkBeginCommandBuffer(m_vkCommandBuffer, &vkCommandBufferBeginInfo); // The call's result is not checked.
+void VulkanCommandBuffer::bindPipeline(const VulkanPipeline &pipeline) // Records a bind of `pipeline` at the bind point the pipeline itself reports.
+    VkPipelineBindPoint vkPipelineBindPoint =
+        (VkPipelineBindPoint)pipeline.getPipelineBindPoint(); // Wrapper enum is cast directly to the Vulkan enum.
+    vkCmdBindPipeline(m_vkCommandBuffer, vkPipelineBindPoint, pipeline);
+void VulkanCommandBuffer::bindDescriptorSets( // Records a bind of a single descriptor set as set number 0.
+    const VulkanPipeline &pipeline, const VulkanPipelineLayout &pipelineLayout,
+    const VulkanDescriptorSet &descriptorSet)
+    VkPipelineBindPoint vkPipelineBindPoint =
+        (VkPipelineBindPoint)pipeline.getPipelineBindPoint(); // `pipeline` is used only to obtain the bind point.
+    VkDescriptorSet vkDescriptorSet = descriptorSet; // Local copy of the handle so its address can be passed below.
+    vkCmdBindDescriptorSets(m_vkCommandBuffer, vkPipelineBindPoint,
+                            pipelineLayout, 0, 1, &vkDescriptorSet, 0, NULL); // firstSet 0, one set, no dynamic offsets.
+void VulkanCommandBuffer::pipelineBarrier(const VulkanImage2DList &image2DList, // Records one layout-transition barrier per image in the list.
+                                          VulkanImageLayout oldImageLayout,
+                                          VulkanImageLayout newImageLayout)
+    std::vector<VkImageMemoryBarrier> vkImageMemoryBarrierList;
+    for (size_t i2DIdx = 0; i2DIdx < image2DList.size(); i2DIdx++)
+    {
+        VkImageSubresourceRange vkImageSubresourceRange = {}; // Colour aspect, every mip level and every array layer from index 0.
+        vkImageSubresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+        vkImageSubresourceRange.baseMipLevel = 0;
+        vkImageSubresourceRange.levelCount = VK_REMAINING_MIP_LEVELS;
+        vkImageSubresourceRange.baseArrayLayer = 0;
+        vkImageSubresourceRange.layerCount = VK_REMAINING_ARRAY_LAYERS;
+        VkImageMemoryBarrier vkImageMemoryBarrier = {};
+        vkImageMemoryBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+        vkImageMemoryBarrier.pNext = NULL;
+        vkImageMemoryBarrier.srcAccessMask = 0; // Both access masks are zero.
+        vkImageMemoryBarrier.dstAccessMask = 0;
+        vkImageMemoryBarrier.oldLayout = (VkImageLayout)oldImageLayout;
+        vkImageMemoryBarrier.newLayout = (VkImageLayout)newImageLayout;
+        vkImageMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; // No queue-family ownership transfer.
+        vkImageMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        vkImageMemoryBarrier.image = image2DList[i2DIdx];
+        vkImageMemoryBarrier.subresourceRange = vkImageSubresourceRange;
+        vkImageMemoryBarrierList.push_back(vkImageMemoryBarrier);
+    }
+    vkCmdPipelineBarrier(m_vkCommandBuffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, // ALL_COMMANDS is used for both source and destination stages.
+                         VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, NULL, 0, // No dependency flags, memory barriers or buffer barriers.
+                         NULL, (uint32_t)vkImageMemoryBarrierList.size(),
+               ; // NOTE(review): the final argument (the image-barrier pointer) is truncated in this excerpt.
+void VulkanCommandBuffer::dispatch(uint32_t groupCountX, uint32_t groupCountY, // Records a compute dispatch with the given workgroup counts.
+                                   uint32_t groupCountZ)
+    vkCmdDispatch(m_vkCommandBuffer, groupCountX, groupCountY, groupCountZ);
+void VulkanCommandBuffer::fillBuffer(const VulkanBuffer &buffer, uint32_t data, // Records a fill of `size` bytes of `buffer`, starting at `offset`, with the 32-bit value `data`.
+                                     uint64_t offset, uint64_t size)
+    vkCmdFillBuffer(m_vkCommandBuffer, buffer, offset, size, data); // Vulkan's argument order (offset, size, data) differs from this wrapper's parameter order.
+void VulkanCommandBuffer::updateBuffer(const VulkanBuffer &buffer, void *pdata, // Records an inline update of `size` bytes of `buffer`, at `offset`, from host memory at `pdata`.
+                                       uint64_t offset, uint64_t size)
+    vkCmdUpdateBuffer(m_vkCommandBuffer, buffer, offset, size, pdata);
+void VulkanCommandBuffer::copyBufferToImage(const VulkanBuffer &buffer, // Records a buffer-to-image copy with one region per mip level of `image`.
+                                            const VulkanImage &image,
+                                            VulkanImageLayout imageLayout)
+    VkDeviceSize bufferOffset = 0; // Running byte offset of the current mip level inside `buffer`.
+    std::vector<VkBufferImageCopy> vkBufferImageCopyList;
+    for (uint32_t mipLevel = 0; mipLevel < image.getNumMipLevels(); mipLevel++)
+    {
+        VulkanExtent3D extent3D = image.getExtent3D(mipLevel);
+        size_t elementSize = getVulkanFormatElementSize(image.getFormat()); // Helper defined elsewhere; the same value is computed on every iteration.
+        VkImageSubresourceLayers vkImageSubresourceLayers = {};
+        vkImageSubresourceLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+        vkImageSubresourceLayers.mipLevel = mipLevel;
+        vkImageSubresourceLayers.baseArrayLayer = 0;
+        vkImageSubresourceLayers.layerCount = image.getNumLayers(); // Each region covers every array layer of this mip level.
+        VkBufferImageCopy vkBufferImageCopy = {};
+        vkBufferImageCopy.bufferOffset = bufferOffset;
+        vkBufferImageCopy.bufferRowLength = 0; // Row length and image height are both 0.
+        vkBufferImageCopy.bufferImageHeight = 0;
+        vkBufferImageCopy.imageSubresource = vkImageSubresourceLayers;
+        vkBufferImageCopy.imageOffset = VulkanOffset3D(0, 0, 0);
+        vkBufferImageCopy.imageExtent = extent3D;
+        vkBufferImageCopyList.push_back(vkBufferImageCopy);
+        bufferOffset += extent3D.getWidth() * extent3D.getHeight() // NOTE(review): the uint32_t factors multiply in 32 bits before widening, and the layer count is not part of the advance - confirm both are intended.
+            * extent3D.getDepth() * elementSize;
+        bufferOffset =
+            ROUND_UP(bufferOffset, // Rounds the next level's offset up to the larger of the element size and the minimum copy alignment.
+                     std::max(elementSize,
+                              (size_t)VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT));
+    }
+    vkCmdCopyBufferToImage(
+        m_vkCommandBuffer, buffer, image, (VkImageLayout)imageLayout,
+        (uint32_t)vkBufferImageCopyList.size(),; // NOTE(review): the final argument (the region pointer) is truncated in this excerpt.
+void VulkanCommandBuffer::copyBufferToImage( // Records a single-region buffer-to-image copy for one mip level and a range of array layers.
+    const VulkanBuffer &buffer, const VulkanImage &image, uint64_t bufferOffset,
+    uint32_t mipLevel, uint32_t baseArrayLayer, uint32_t layerCount,
+    VulkanOffset3D offset3D, VulkanExtent3D extent3D)
+    VkImageSubresourceLayers vkImageSubresourceLayers = {};
+    vkImageSubresourceLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    vkImageSubresourceLayers.mipLevel = mipLevel;
+    vkImageSubresourceLayers.baseArrayLayer = baseArrayLayer;
+    vkImageSubresourceLayers.layerCount = layerCount;
+    VkExtent3D vkExtent3D = extent3D;
+    if ((extent3D.getWidth() == 0) && (extent3D.getHeight() == 0) // An all-zero extent means "use the image's full extent at this mip level".
+        && (extent3D.getDepth() == 0))
+    {
+        vkExtent3D = image.getExtent3D(mipLevel);
+    }
+    VkBufferImageCopy vkBufferImageCopy = {};
+    vkBufferImageCopy.bufferOffset = bufferOffset;
+    vkBufferImageCopy.bufferRowLength = 0; // Row length and image height are both 0.
+    vkBufferImageCopy.bufferImageHeight = 0;
+    vkBufferImageCopy.imageSubresource = vkImageSubresourceLayers;
+    vkBufferImageCopy.imageOffset = offset3D;
+    vkBufferImageCopy.imageExtent = vkExtent3D;
+    vkCmdCopyBufferToImage(m_vkCommandBuffer, buffer, image,
+                           VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, // The destination layout is hard-coded to TRANSFER_DST_OPTIMAL.
+                           &vkBufferImageCopy);
+void VulkanCommandBuffer::copyImageToBuffer( // Records a single-region image-to-buffer copy for one mip level and a range of array layers.
+    const VulkanImage &image, const VulkanBuffer &buffer, uint64_t bufferOffset,
+    uint32_t mipLevel, uint32_t baseArrayLayer, uint32_t layerCount,
+    VulkanOffset3D offset3D, VulkanExtent3D extent3D)
+    VkImageSubresourceLayers vkImageSubresourceLayers = {};
+    vkImageSubresourceLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    vkImageSubresourceLayers.mipLevel = mipLevel;
+    vkImageSubresourceLayers.baseArrayLayer = baseArrayLayer;
+    vkImageSubresourceLayers.layerCount = layerCount;
+    VkExtent3D vkExtent3D = extent3D;
+    if ((extent3D.getWidth() == 0) && (extent3D.getHeight() == 0) // An all-zero extent means "use the image's full extent at this mip level".
+        && (extent3D.getDepth() == 0))
+    {
+        vkExtent3D = image.getExtent3D(mipLevel);
+    }
+    VkBufferImageCopy vkBufferImageCopy = {};
+    vkBufferImageCopy.bufferOffset = bufferOffset;
+    vkBufferImageCopy.bufferRowLength = 0; // Row length and image height are both 0.
+    vkBufferImageCopy.bufferImageHeight = 0;
+    vkBufferImageCopy.imageSubresource = vkImageSubresourceLayers;
+    vkBufferImageCopy.imageOffset = offset3D;
+    vkBufferImageCopy.imageExtent = vkExtent3D;
+    vkCmdCopyImageToBuffer(m_vkCommandBuffer, image, VK_IMAGE_LAYOUT_GENERAL, // The source layout is hard-coded to GENERAL.
+                           buffer, 1, &vkBufferImageCopy);
+void VulkanCommandBuffer::end() { vkEndCommandBuffer(m_vkCommandBuffer); } // Finishes recording; the call's result is discarded.
+VulkanCommandBuffer::operator VkCommandBuffer() const // Conversion operator: yields the wrapped handle.
+    return m_vkCommandBuffer;
+// VulkanBuffer implementation //
+VulkanBuffer::VulkanBuffer(const VulkanBuffer &buffer) // Copy constructor: duplicates the raw handle value and the cached size, alignment and memory types.
+    : m_device(buffer.m_device), m_vkBuffer(buffer.m_vkBuffer),
+      m_size(buffer.m_size), m_alignment(buffer.m_alignment),
+      m_memoryTypeList(buffer.m_memoryTypeList)
+    const VulkanDevice &device, uint64_t size, // Creating-constructor parameters (declarator line not in this excerpt).
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanBufferUsage bufferUsage, VulkanSharingMode sharingMode,
+    const VulkanQueueFamilyList &queueFamilyList)
+    : m_device(device), m_vkBuffer(VK_NULL_HANDLE)
+    std::vector<uint32_t> queueFamilyIndexList;
+    if (queueFamilyList.size() == 0)
+    {
+        // An empty list means "every queue family of the physical device".
+        for (size_t qfIdx = 0;
+             qfIdx < device.getPhysicalDevice().getQueueFamilyList().size();
+             qfIdx++)
+        {
+            queueFamilyIndexList.push_back(
+                device.getPhysicalDevice().getQueueFamilyList()[qfIdx]);
+        }
+    }
+    else
+    {
+        for (size_t qfIdx = 0; qfIdx < queueFamilyList.size(); qfIdx++)
+        {
+            queueFamilyIndexList.push_back(queueFamilyList[qfIdx]);
+        }
+    }
+    VkBufferCreateInfo vkBufferCreateInfo = {}; // NOTE(review): no sType assignment is visible in this excerpt; confirm against the full file.
+    vkBufferCreateInfo.pNext = NULL;
+    vkBufferCreateInfo.flags = 0;
+    vkBufferCreateInfo.size = (VkDeviceSize)size;
+    vkBufferCreateInfo.usage = (VkBufferUsageFlags)bufferUsage;
+    vkBufferCreateInfo.sharingMode = (VkSharingMode)sharingMode;
+    vkBufferCreateInfo.queueFamilyIndexCount =
+        (uint32_t)queueFamilyIndexList.size();
+    vkBufferCreateInfo.pQueueFamilyIndices =; // NOTE(review): the right-hand side is truncated in this excerpt.
+    // Declared outside the if so it is still alive when vkCreateBuffer
+    // follows the pNext chain.
+    VkExternalMemoryBufferCreateInfo vkExternalMemoryBufferCreateInfo = {};
+    if (externalMemoryHandleType != VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE)
+    {
+        vkExternalMemoryBufferCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+        vkExternalMemoryBufferCreateInfo.pNext = NULL;
+        vkExternalMemoryBufferCreateInfo.handleTypes =
+            (VkExternalMemoryHandleTypeFlags)externalMemoryHandleType;
+        vkBufferCreateInfo.pNext = &vkExternalMemoryBufferCreateInfo;
+    }
+    vkCreateBuffer(m_device, &vkBufferCreateInfo, NULL, &m_vkBuffer); // NULL allocator; the call's result is not checked.
+    VkMemoryRequirements vkMemoryRequirements = {};
+    vkGetBufferMemoryRequirements(m_device, m_vkBuffer, &vkMemoryRequirements);
+    // Cache the driver-reported size and alignment, which may differ from
+    // the requested size.
+    m_size = vkMemoryRequirements.size;
+    m_alignment = vkMemoryRequirements.alignment;
+    const VulkanMemoryTypeList &memoryTypeList =
+        m_device.getPhysicalDevice().getMemoryTypeList();
+    for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+    {
+        uint32_t memoryTypeIndex = memoryTypeList[mtIdx];
+        // Shift an unsigned 32-bit one: memoryTypeBits is a 32-bit mask, and
+        // a signed `1 << 31` would overflow int for memory type index 31.
+        if ((uint32_t(1) << memoryTypeIndex)
+            & vkMemoryRequirements.memoryTypeBits)
+        {
+            m_memoryTypeList.add(memoryTypeList[mtIdx]);
+        }
+    }
+VulkanBuffer::~VulkanBuffer() { vkDestroyBuffer(m_device, m_vkBuffer, NULL); } // Destroys the wrapped buffer handle.
+uint64_t VulkanBuffer::getSize() const { return m_size; } // Size reported by vkGetBufferMemoryRequirements.
+uint64_t VulkanBuffer::getAlignment() const { return m_alignment; } // Alignment reported by vkGetBufferMemoryRequirements.
+const VulkanMemoryTypeList &VulkanBuffer::getMemoryTypeList() const // Memory types whose bit is set in the buffer's memoryTypeBits.
+    return m_memoryTypeList;
+VulkanBuffer::operator VkBuffer() const { return m_vkBuffer; } // Conversion operator: yields the wrapped handle.
+// VulkanImage implementation //
+VulkanImage::VulkanImage(const VulkanImage &image) // Copy constructor: duplicates the raw handle value and the cached image properties.
+    : m_device(image.m_device), m_imageType(image.m_imageType),
+      m_extent3D(image.m_extent3D), m_format(image.m_format),
+      m_numMipLevels(image.m_numMipLevels), m_numLayers(image.m_numLayers),
+      m_vkImage(image.m_vkImage), m_size(image.m_size),
+      m_alignment(image.m_alignment), m_memoryTypeList(image.m_memoryTypeList)
+    const VulkanDevice &device, VulkanImageType imageType, VulkanFormat format, // Creating-constructor parameters (declarator line not in this excerpt).
+    const VulkanExtent3D &extent3D, uint32_t numMipLevels, uint32_t arrayLayers,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanImageCreateFlag imageCreateFlag, VulkanImageTiling imageTiling,
+    VulkanImageUsage imageUsage, VulkanSharingMode sharingMode)
+    : m_device(device), m_imageType(imageType), m_extent3D(extent3D),
+      m_format(format), m_numMipLevels(numMipLevels), m_numLayers(arrayLayers),
+      m_vkImage(VK_NULL_HANDLE)
+    VkImageCreateInfo vkImageCreateInfo = {}; // NOTE(review): no sType assignment is visible in this excerpt; confirm against the full file.
+    vkImageCreateInfo.pNext = NULL;
+    vkImageCreateInfo.flags = (VkImageCreateFlags)imageCreateFlag;
+    vkImageCreateInfo.imageType = (VkImageType)imageType;
+    vkImageCreateInfo.format = (VkFormat)format;
+    vkImageCreateInfo.extent = extent3D;
+    vkImageCreateInfo.mipLevels = numMipLevels;
+    vkImageCreateInfo.arrayLayers = arrayLayers;
+    vkImageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; // Single-sampled image.
+    vkImageCreateInfo.tiling = (VkImageTiling)imageTiling;
+    vkImageCreateInfo.usage = (VkImageUsageFlags)imageUsage;
+    vkImageCreateInfo.sharingMode = (VkSharingMode)sharingMode;
+    // Every queue family of the physical device is listed.
+    vkImageCreateInfo.queueFamilyIndexCount =
+        (uint32_t)m_device.getPhysicalDevice().getQueueFamilyList().size();
+    vkImageCreateInfo.pQueueFamilyIndices =
+        m_device.getPhysicalDevice().getQueueFamilyList()();
+    vkImageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    // Declared outside the if so it is still alive when vkCreateImage
+    // follows the pNext chain.
+    VkExternalMemoryImageCreateInfo vkExternalMemoryImageCreateInfo = {};
+    if (externalMemoryHandleType != VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE)
+    {
+        vkExternalMemoryImageCreateInfo.sType = // NOTE(review): the sType value line is absent from this excerpt.
+        vkExternalMemoryImageCreateInfo.pNext = NULL;
+        vkExternalMemoryImageCreateInfo.handleTypes =
+            (VkExternalMemoryHandleTypeFlags)externalMemoryHandleType;
+        vkImageCreateInfo.pNext = &vkExternalMemoryImageCreateInfo;
+    }
+    vkCreateImage(m_device, &vkImageCreateInfo, NULL, &m_vkImage); // NULL allocator; the call's result is not checked.
+    VulkanImageCreateInfo = vkImageCreateInfo; // NOTE(review): the stored copy keeps pNext, which may point at the local struct above - confirm callers do not follow it.
+    VkMemoryRequirements vkMemoryRequirements = {};
+    vkGetImageMemoryRequirements(m_device, m_vkImage, &vkMemoryRequirements);
+    // Cache the driver-reported size and alignment for the image.
+    m_size = vkMemoryRequirements.size;
+    m_alignment = vkMemoryRequirements.alignment;
+    const VulkanMemoryTypeList &memoryTypeList =
+        m_device.getPhysicalDevice().getMemoryTypeList();
+    for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
+    {
+        uint32_t memoryTypeIndex = memoryTypeList[mtIdx];
+        // Shift an unsigned 32-bit one: memoryTypeBits is a 32-bit mask, and
+        // a signed `1 << 31` would overflow int for memory type index 31.
+        if ((uint32_t(1) << memoryTypeIndex)
+            & vkMemoryRequirements.memoryTypeBits)
+        {
+            m_memoryTypeList.add(memoryTypeList[mtIdx]);
+        }
+    }
+VulkanImage::~VulkanImage() { vkDestroyImage(m_device, m_vkImage, NULL); } // Destroys the wrapped image handle.
+VulkanExtent3D VulkanImage::getExtent3D(uint32_t mipLevel) const // Base implementation: ignores mipLevel and returns a zero extent (VulkanImage2D in this file defines its own version).
+    return VulkanExtent3D(0, 0, 0);
+VulkanFormat VulkanImage::getFormat() const { return m_format; } // Format passed at construction.
+VkImageCreateInfo VulkanImage::getVkImageCreateInfo() const // Returns, by value, the create-info saved by the constructor.
+    return VulkanImageCreateInfo;
+uint32_t VulkanImage::getNumMipLevels() const { return m_numMipLevels; } // Mip-level count passed at construction.
+uint32_t VulkanImage::getNumLayers() const { return m_numLayers; } // Array-layer count passed at construction.
+uint64_t VulkanImage::getSize() const { return m_size; } // Size reported by vkGetImageMemoryRequirements.
+uint64_t VulkanImage::getAlignment() const { return m_alignment; } // Alignment reported by vkGetImageMemoryRequirements.
+const VulkanMemoryTypeList &VulkanImage::getMemoryTypeList() const // Memory types whose bit is set in the image's memoryTypeBits.
+    return m_memoryTypeList;
+VulkanImage::operator VkImage() const { return m_vkImage; } // Conversion operator: yields the wrapped handle.
+// VulkanImage2D implementation //
+VulkanImage2D::VulkanImage2D(const VulkanImage2D &image2D): VulkanImage(image2D) // Copy constructor: delegates to the base-class copy constructor.
+    const VulkanDevice &device, VulkanFormat format, uint32_t width, // Creating-constructor parameters (declarator line not in this excerpt).
+    uint32_t height, uint32_t numMipLevels,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
+    VulkanSharingMode sharingMode)
+    : VulkanImage(device, VULKAN_IMAGE_TYPE_2D, format, // Fixes the base image to 2D type, depth 1, one array layer and optimal tiling.
+                  VulkanExtent3D(width, height, 1), numMipLevels, 1,
+                  externalMemoryHandleType, imageCreateFlag,
+                  VULKAN_IMAGE_TILING_OPTIMAL, imageUsage, sharingMode)
+VulkanImage2D::~VulkanImage2D() {} // Empty destructor body.
+VulkanExtent3D VulkanImage2D::getExtent3D(uint32_t mipLevel) const // Extent of the requested mip level: each dimension is halved per level and clamped to at least 1.
+    uint32_t width = std::max(m_extent3D.getWidth() >> mipLevel, uint32_t(1));
+    uint32_t height = std::max(m_extent3D.getHeight() >> mipLevel, uint32_t(1));
+    uint32_t depth = 1; // Depth is always 1.
+    return VulkanExtent3D(width, height, depth);
+// VulkanImageView implementation //
+VulkanImageView::VulkanImageView(const VulkanImageView &imageView) // copy constructor
+    : m_device(imageView.m_device), m_vkImageView(imageView.m_vkImageView) // shallow: the copy holds the same VkImageView handle
+// Creates a VkImageView on `device` for `image`.
+//   imageViewType   view type, cast directly to VkImageViewType
+//   baseMipLevel    first mip level visible through the view
+//   levelCount      number of mip levels visible through the view
+//   baseArrayLayer  first array layer visible through the view
+//   layerCount      number of array layers visible through the view
+// The view uses the image's own format, the identity swizzle on all four
+// components and the colour aspect only.
+// The VkResult of vkCreateImageView is not checked here.
+VulkanImageView::VulkanImageView(const VulkanDevice &device,
+                                 const VulkanImage &image,
+                                 VulkanImageViewType imageViewType,
+                                 uint32_t baseMipLevel, uint32_t levelCount,
+                                 uint32_t baseArrayLayer, uint32_t layerCount)
+    : m_device(device), m_vkImageView(VK_NULL_HANDLE)
+    // Identity swizzle: each component reads its own channel.
+    VkComponentMapping vkComponentMapping = {};
+    vkComponentMapping.r = VK_COMPONENT_SWIZZLE_IDENTITY;
+    vkComponentMapping.g = VK_COMPONENT_SWIZZLE_IDENTITY;
+    vkComponentMapping.b = VK_COMPONENT_SWIZZLE_IDENTITY;
+    vkComponentMapping.a = VK_COMPONENT_SWIZZLE_IDENTITY;
+    // Sub-range of the image (colour aspect) exposed by the view.
+    VkImageSubresourceRange vkImageSubresourceRange = {};
+    vkImageSubresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    vkImageSubresourceRange.baseMipLevel = baseMipLevel;
+    vkImageSubresourceRange.levelCount = levelCount;
+    vkImageSubresourceRange.baseArrayLayer = baseArrayLayer;
+    vkImageSubresourceRange.layerCount = layerCount;
+    VkImageViewCreateInfo vkImageViewCreateInfo = {};
+    // sType is mandatory: zero-initialisation alone leaves it invalid.
+    vkImageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    vkImageViewCreateInfo.pNext = NULL;
+    vkImageViewCreateInfo.flags = 0;
+    vkImageViewCreateInfo.image = image;
+    vkImageViewCreateInfo.viewType = (VkImageViewType)imageViewType;
+    vkImageViewCreateInfo.format = (VkFormat)image.getFormat();
+    vkImageViewCreateInfo.components = vkComponentMapping;
+    vkImageViewCreateInfo.subresourceRange = vkImageSubresourceRange;
+    vkCreateImageView(m_device, &vkImageViewCreateInfo, NULL, &m_vkImageView);
+    vkDestroyImageView(m_device, m_vkImageView, NULL);
+VulkanImageView::operator VkImageView() const { return m_vkImageView; } // implicit conversion to the raw handle
+// VulkanDeviceMemory implementation //
+#if defined(_WIN32) || defined(_WIN64)
+class WindowsSecurityAttributes { // builds the SECURITY_ATTRIBUTES used as pAttributes when exporting Win32 handles
+    SECURITY_ATTRIBUTES m_winSecurityAttributes;
+    PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; // heap block: descriptor followed by two pointer slots
+    WindowsSecurityAttributes();
+    SECURITY_ATTRIBUTES *operator&(); // lets `&object` yield the SECURITY_ATTRIBUTES pointer
+    ~WindowsSecurityAttributes();
+    m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(
+        1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); // descriptor + 2 trailing pointer slots
+    // CHECK_NEQ(m_winPSecurityDescriptor, (PSECURITY_DESCRIPTOR)NULL);
+    PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor
+                           + SECURITY_DESCRIPTOR_MIN_LENGTH); // slot 0: PSID, directly after the descriptor
+    PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); // slot 1: PACL, directly after the PSID slot
+    InitializeSecurityDescriptor(m_winPSecurityDescriptor,
+                                 SECURITY_DESCRIPTOR_REVISION);
+    SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority =
+    AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0,
+                             0, 0, 0, 0, 0, 0, ppSID); // allocated SID is stored in slot 0
+    EXPLICIT_ACCESS explicitAccess;
+    ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS));
+    explicitAccess.grfAccessPermissions =
+    explicitAccess.grfAccessMode = SET_ACCESS;
+    explicitAccess.grfInheritance = INHERIT_ONLY;
+    explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID;
+    explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP;
+    explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; // trustee is the SID created above
+    SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); // new ACL is stored in slot 1
+    SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); // install that ACL as the DACL
+    m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes);
+    m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor;
+    m_winSecurityAttributes.bInheritHandle = TRUE;
+SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&()
+    return &m_winSecurityAttributes;
+    PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor
+                           + SECURITY_DESCRIPTOR_MIN_LENGTH); // recompute slot 0 (PSID)
+    PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); // recompute slot 1 (PACL)
+    if (*ppSID)
+    {
+        FreeSid(*ppSID); // releases the SID from AllocateAndInitializeSid
+    }
+    if (*ppACL)
+    {
+        LocalFree(*ppACL); // releases the ACL from SetEntriesInAcl
+    }
+    free(m_winPSecurityDescriptor); // releases the calloc'ed block
+VulkanDeviceMemory::VulkanDeviceMemory(const VulkanDeviceMemory &deviceMemory) // copy constructor
+    : m_device(deviceMemory.m_device),
+      m_vkDeviceMemory(deviceMemory.m_vkDeviceMemory), // shallow: same VkDeviceMemory handle as the source
+      m_size(deviceMemory.m_size), m_isDedicated(deviceMemory.m_isDedicated)
+    const VulkanDevice &device, uint64_t size,
+    const VulkanMemoryType &memoryType,
+    VulkanExternalMemoryHandleType externalMemoryHandleType, const void *name)
+    : m_device(device), m_size(size), m_isDedicated(false)
+#if defined(_WIN32) || defined(_WIN64)
+    WindowsSecurityAttributes winSecurityAttributes;
+    VkExportMemoryWin32HandleInfoKHR vkExportMemoryWin32HandleInfoKHR = {};
+    vkExportMemoryWin32HandleInfoKHR.sType =
+    vkExportMemoryWin32HandleInfoKHR.pNext = NULL;
+    vkExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+    vkExportMemoryWin32HandleInfoKHR.dwAccess =
+ = (LPCWSTR)name;
+    VkExportMemoryAllocateInfoKHR vkExportMemoryAllocateInfoKHR = {};
+    vkExportMemoryAllocateInfoKHR.sType =
+#if defined(_WIN32) || defined(_WIN64)
+    vkExportMemoryAllocateInfoKHR.pNext = externalMemoryHandleType
+        ? &vkExportMemoryWin32HandleInfoKHR
+        : NULL;
+    vkExportMemoryAllocateInfoKHR.pNext = NULL;
+    vkExportMemoryAllocateInfoKHR.handleTypes =
+        (VkExternalMemoryHandleTypeFlagsKHR)externalMemoryHandleType;
+    VkMemoryAllocateInfo vkMemoryAllocateInfo = {};
+    vkMemoryAllocateInfo.pNext =
+        externalMemoryHandleType ? &vkExportMemoryAllocateInfoKHR : NULL;
+    vkMemoryAllocateInfo.allocationSize = m_size;
+    vkMemoryAllocateInfo.memoryTypeIndex = (uint32_t)memoryType;
+    vkAllocateMemory(m_device, &vkMemoryAllocateInfo, NULL, &m_vkDeviceMemory);
+    const VulkanDevice &device, const VulkanImage &image,
+    const VulkanMemoryType &memoryType,
+    VulkanExternalMemoryHandleType externalMemoryHandleType, const void *name)
+    : m_device(device), m_size(image.getSize()), m_isDedicated(true)
+#if defined(_WIN32) || defined(_WIN64)
+    WindowsSecurityAttributes winSecurityAttributes;
+    VkExportMemoryWin32HandleInfoKHR vkExportMemoryWin32HandleInfoKHR = {};
+    vkExportMemoryWin32HandleInfoKHR.sType =
+    vkExportMemoryWin32HandleInfoKHR.pNext = NULL;
+    vkExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+    vkExportMemoryWin32HandleInfoKHR.dwAccess =
+ = (LPCWSTR)name;
+    VkExportMemoryAllocateInfoKHR vkExportMemoryAllocateInfoKHR = {};
+    vkExportMemoryAllocateInfoKHR.sType =
+#if defined(_WIN32) || defined(_WIN64)
+    vkExportMemoryAllocateInfoKHR.pNext = externalMemoryHandleType
+        ? &vkExportMemoryWin32HandleInfoKHR
+        : NULL;
+    vkExportMemoryAllocateInfoKHR.pNext = NULL;
+    vkExportMemoryAllocateInfoKHR.handleTypes =
+        (VkExternalMemoryHandleTypeFlagsKHR)externalMemoryHandleType;
+    VkMemoryDedicatedAllocateInfo vkMemoryDedicatedAllocateInfo = {};
+    vkMemoryDedicatedAllocateInfo.sType =
+    vkMemoryDedicatedAllocateInfo.pNext =
+        externalMemoryHandleType ? &vkExportMemoryAllocateInfoKHR : NULL;
+    vkMemoryDedicatedAllocateInfo.image = image;
+    vkMemoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;
+    VkMemoryAllocateInfo vkMemoryAllocateInfo = {};
+    vkMemoryAllocateInfo.pNext = &vkMemoryDedicatedAllocateInfo;
+    vkMemoryAllocateInfo.allocationSize = m_size;
+    vkMemoryAllocateInfo.memoryTypeIndex = (uint32_t)memoryType;
+    vkAllocateMemory(m_device, &vkMemoryAllocateInfo, NULL, &m_vkDeviceMemory);
+    vkFreeMemory(m_device, m_vkDeviceMemory, NULL);
+uint64_t VulkanDeviceMemory::getSize() const { return m_size; } // the value used as allocationSize
+#ifdef _WIN32
+HANDLE VulkanDeviceMemory::getHandle( // Win32 variant: returns a HANDLE for this allocation
+    VulkanExternalMemoryHandleType externalMemoryHandleType) const
+    HANDLE handle; // NOTE(review): stays uninitialized if the call below fails (result unchecked)
+    VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {};
+    vkMemoryGetWin32HandleInfoKHR.sType =
+    vkMemoryGetWin32HandleInfoKHR.pNext = NULL;
+    vkMemoryGetWin32HandleInfoKHR.memory = m_vkDeviceMemory;
+    vkMemoryGetWin32HandleInfoKHR.handleType =
+        (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; // requested handle type, cast unchanged
+    vkGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR,
+                              &handle);
+    return handle;
+int VulkanDeviceMemory::getHandle( // file-descriptor variant; presumably the non-Windows branch — confirm
+    VulkanExternalMemoryHandleType externalMemoryHandleType) const
+    if (externalMemoryHandleType
+    {
+        int fd; // NOTE(review): stays uninitialized if vkGetMemoryFdKHR fails (result unchecked)
+        VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {};
+        vkMemoryGetFdInfoKHR.pNext = NULL;
+        vkMemoryGetFdInfoKHR.memory = m_vkDeviceMemory;
+        vkMemoryGetFdInfoKHR.handleType =
+        vkGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd);
+        return fd;
+    }
+    return HANDLE_ERROR; // reached when the handle-type check above does not hold
+bool VulkanDeviceMemory::isDedicated() const { return m_isDedicated; } // flag set by the constructors
+// Maps `size` bytes of this allocation, starting at byte `offset`, into host
+// address space via vkMapMemory and returns the host pointer.
+// Returns NULL when vkMapMemory does not report VK_SUCCESS, so a failed
+// mapping can never hand the caller an indeterminate pointer.
+void *VulkanDeviceMemory::map(size_t offset, size_t size)
+    void *pData = NULL;
+    VkResult vkStatus =
+        vkMapMemory(m_device, m_vkDeviceMemory, (VkDeviceSize)offset,
+                    (VkDeviceSize)size, 0, &pData);
+    return (vkStatus == VK_SUCCESS) ? pData : NULL;
+void VulkanDeviceMemory::unmap() { vkUnmapMemory(m_device, m_vkDeviceMemory); } // counterpart of map()
+void VulkanDeviceMemory::bindBuffer(const VulkanBuffer &buffer, uint64_t offset)
+    vkBindBufferMemory(m_device, buffer, m_vkDeviceMemory, offset); // binds this memory to `buffer`; VkResult is not checked
+void VulkanDeviceMemory::bindImage(const VulkanImage &image, uint64_t offset)
+    vkBindImageMemory(m_device, image, m_vkDeviceMemory, offset); // binds this memory to `image`; VkResult is not checked
+VulkanDeviceMemory::operator VkDeviceMemory() const { return m_vkDeviceMemory; } // implicit conversion to the raw handle
+// VulkanSemaphore implementation //
+// Copy constructor. Copies the device reference, the raw VkSemaphore handle
+// and the semaphore name, so getName() on the copy matches the source object.
+// The copy is shallow: both objects refer to the same VkSemaphore handle.
+VulkanSemaphore::VulkanSemaphore(const VulkanSemaphore &semaphore)
+    : m_device(semaphore.m_device), m_vkSemaphore(semaphore.m_vkSemaphore),
+      m_name(semaphore.m_name)
+    const VulkanDevice &device,
+    VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
+    const std::wstring name)
+    : m_device(device), m_name(name)
+#if defined(_WIN32) || defined(_WIN64)
+    WindowsSecurityAttributes winSecurityAttributes;
+    VkExportSemaphoreWin32HandleInfoKHR
+        vkExportSemaphoreWin32HandleInfoKHR = {};
+    vkExportSemaphoreWin32HandleInfoKHR.sType =
+    vkExportSemaphoreWin32HandleInfoKHR.pNext = NULL;
+    vkExportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+    vkExportSemaphoreWin32HandleInfoKHR.dwAccess =
+ =
+        m_name.size() ? (LPCWSTR)m_name.c_str() : NULL;
+    VkExportSemaphoreCreateInfoKHR vkExportSemaphoreCreateInfoKHR = {};
+    vkExportSemaphoreCreateInfoKHR.sType =
+#if defined(_WIN32) || defined(_WIN64)
+    vkExportSemaphoreCreateInfoKHR.pNext =
+        (externalSemaphoreHandleType
+        ? &vkExportSemaphoreWin32HandleInfoKHR
+        : NULL;
+    vkExportSemaphoreCreateInfoKHR.pNext = NULL;
+    vkExportSemaphoreCreateInfoKHR.handleTypes =
+        (VkExternalSemaphoreHandleTypeFlagsKHR)externalSemaphoreHandleType;
+    VkSemaphoreCreateInfo vkSemaphoreCreateInfo = {};
+    vkSemaphoreCreateInfo.pNext =
+        (externalSemaphoreHandleType
+        ? &vkExportSemaphoreCreateInfoKHR
+        : NULL;
+    vkSemaphoreCreateInfo.flags = 0;
+    vkCreateSemaphore(m_device, &vkSemaphoreCreateInfo, NULL, &m_vkSemaphore);
+    vkDestroySemaphore(m_device, m_vkSemaphore, NULL);
+#if defined(_WIN32) || defined(_WIN64)
+HANDLE VulkanSemaphore::getHandle( // Win32 variant: returns a HANDLE for this semaphore
+    VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const
+    HANDLE handle; // NOTE(review): stays uninitialized if the call below fails (result unchecked)
+    VkSemaphoreGetWin32HandleInfoKHR vkSemaphoreGetWin32HandleInfoKHR = {};
+    vkSemaphoreGetWin32HandleInfoKHR.sType =
+    vkSemaphoreGetWin32HandleInfoKHR.pNext = NULL;
+    vkSemaphoreGetWin32HandleInfoKHR.semaphore = m_vkSemaphore;
+    vkSemaphoreGetWin32HandleInfoKHR.handleType =
+        (VkExternalSemaphoreHandleTypeFlagBitsKHR)externalSemaphoreHandleType; // requested handle type, cast unchanged
+    vkGetSemaphoreWin32HandleKHR(m_device, &vkSemaphoreGetWin32HandleInfoKHR,
+                                 &handle);
+    return handle;
+int VulkanSemaphore::getHandle( // file-descriptor variant; presumably the non-Windows branch — confirm
+    VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const
+    if (externalSemaphoreHandleType
+    {
+        int fd; // NOTE(review): stays uninitialized if vkGetSemaphoreFdKHR fails (result unchecked)
+        VkSemaphoreGetFdInfoKHR vkSemaphoreGetFdInfoKHR = {};
+        vkSemaphoreGetFdInfoKHR.sType =
+        vkSemaphoreGetFdInfoKHR.pNext = NULL;
+        vkSemaphoreGetFdInfoKHR.semaphore = m_vkSemaphore;
+        vkSemaphoreGetFdInfoKHR.handleType =
+        vkGetSemaphoreFdKHR(m_device, &vkSemaphoreGetFdInfoKHR, &fd);
+        return fd;
+    }
+    return HANDLE_ERROR; // reached when the handle-type check above does not hold
+const std::wstring &VulkanSemaphore::getName() const { return m_name; } // reference to the stored name, not a copy
+VulkanSemaphore::operator VkSemaphore() const { return m_vkSemaphore; } // implicit conversion to the raw handle
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.hpp
new file mode 100644
index 0000000..37925ee
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.hpp
@@ -0,0 +1,580 @@
+// Copyright (c) 2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _vulkan_wrapper_hpp_
+#define _vulkan_wrapper_hpp_
+#include <vulkan/vulkan.h>
+#include "vulkan_wrapper_types.hpp"
+#include "vulkan_list_map.hpp"
+#include "vulkan_api_list.hpp"
+class VulkanInstance { // holds a VkInstance handle and a VulkanPhysicalDeviceList
+    friend const VulkanInstance &getVulkanInstance(); // free function granted access to non-public members
+    VkInstance m_vkInstance;
+    VulkanPhysicalDeviceList m_physicalDeviceList;
+    VulkanInstance();
+    VulkanInstance(const VulkanInstance &);
+    virtual ~VulkanInstance();
+    const VulkanPhysicalDeviceList &getPhysicalDeviceList() const;
+    operator VkInstance() const; // implicit conversion to the raw handle
+class VulkanPhysicalDevice { // a VkPhysicalDevice plus its properties, features and heap/type/queue-family lists
+    friend class VulkanInstance;
+    VkPhysicalDevice m_vkPhysicalDevice;
+    VkPhysicalDeviceProperties m_vkPhysicalDeviceProperties;
+    uint8_t m_vkDeviceUUID[VK_UUID_SIZE];
+    uint8_t m_vkDeviceLUID[VK_LUID_SIZE];
+    uint32_t m_vkDeviceNodeMask;
+    VkPhysicalDeviceFeatures m_vkPhysicalDeviceFeatures;
+    VkPhysicalDeviceMemoryProperties m_vkPhysicalDeviceMemoryProperties;
+    VulkanQueueFamilyList m_queueFamilyList;
+    VulkanMemoryHeapList m_memoryHeapList;
+    VulkanMemoryTypeList m_memoryTypeList;
+    VulkanPhysicalDevice(const VulkanPhysicalDevice &physicalDevice);
+    VulkanPhysicalDevice(VkPhysicalDevice vkPhysicalDevice);
+    virtual ~VulkanPhysicalDevice();
+    const VulkanQueueFamilyList &getQueueFamilyList() const;
+    const VulkanMemoryHeapList &getMemoryHeapList() const;
+    const VulkanMemoryTypeList &getMemoryTypeList() const;
+    const uint8_t *getUUID() const; // presumably points at m_vkDeviceUUID (VK_UUID_SIZE bytes) — confirm
+    const uint8_t *getLUID() const; // presumably points at m_vkDeviceLUID (VK_LUID_SIZE bytes) — confirm
+    uint32_t getNodeMask() const;
+    operator VkPhysicalDevice() const; // implicit conversion to the raw handle
+// Describes one Vulkan memory heap: its index, its size and its heap flags.
+// Converts implicitly to uint32_t (presumably the heap index; confirm against
+// the definition of operator uint32_t).
+class VulkanMemoryHeap {
+    friend class VulkanPhysicalDevice;
+    uint32_t m_memoryHeapIndex;
+    uint64_t m_size;
+    VulkanMemoryHeapFlag m_memoryHeapFlag;
+    VulkanMemoryHeap(const VulkanMemoryHeap &memoryHeap);
+    // Parameters are named without the m_ prefix so they neither shadow nor
+    // read as the data members they initialise.
+    VulkanMemoryHeap(uint32_t memoryHeapIndex, uint64_t size,
+                     VulkanMemoryHeapFlag memoryHeapFlag);
+    virtual ~VulkanMemoryHeap();
+    uint64_t getSize() const;
+    VulkanMemoryHeapFlag getMemoryHeapFlag() const;
+    operator uint32_t() const;
+class VulkanMemoryType { // one memory type: index, property flags and the heap it refers to
+    friend class VulkanPhysicalDevice;
+    uint32_t m_memoryTypeIndex;
+    const VulkanMemoryTypeProperty m_memoryTypeProperty;
+    const VulkanMemoryHeap &m_memoryHeap; // reference member: the heap object must outlive this object
+    VulkanMemoryType(const VulkanMemoryType &memoryType);
+    VulkanMemoryType(uint32_t memoryTypeIndex,
+                     VulkanMemoryTypeProperty memoryTypeProperty,
+                     const VulkanMemoryHeap &memoryHeap);
+    virtual ~VulkanMemoryType();
+    VulkanMemoryTypeProperty getMemoryTypeProperty() const;
+    const VulkanMemoryHeap &getMemoryHeap() const;
+    operator uint32_t() const; // used as VkMemoryAllocateInfo::memoryTypeIndex by VulkanDeviceMemory
+class VulkanQueueFamily { // a queue-family index together with its VkQueueFamilyProperties
+    friend class VulkanPhysicalDevice;
+    uint32_t m_queueFamilyIndex;
+    VkQueueFamilyProperties m_vkQueueFamilyProperties;
+    VulkanQueueFamily(const VulkanQueueFamily &queueFamily);
+    VulkanQueueFamily(uint32_t queueFamilyIndex,
+                      VkQueueFamilyProperties vkQueueFamilyProperties);
+    virtual ~VulkanQueueFamily();
+    uint32_t getQueueFlags() const;
+    uint32_t getQueueCount() const;
+    operator uint32_t() const; // presumably yields m_queueFamilyIndex — confirm
+class VulkanDevice { // a VkDevice plus a map from queue family to its queues
+    const VulkanPhysicalDevice &m_physicalDevice; // reference member: the physical device must outlive this object
+    VkDevice m_vkDevice;
+    VulkanQueueFamilyToQueueListMap m_queueFamilyIndexToQueueListMap;
+    VulkanDevice(const VulkanDevice &device);
+    VulkanDevice(
+        const VulkanPhysicalDevice &physicalDevice = getVulkanPhysicalDevice(), // default comes from getVulkanPhysicalDevice()
+        const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap =
+            getDefaultVulkanQueueFamilyToQueueCountMap());
+    virtual ~VulkanDevice();
+    const VulkanPhysicalDevice &getPhysicalDevice() const;
+    VulkanQueue &
+    getQueue(const VulkanQueueFamily &queueFamily = getVulkanQueueFamily(),
+             uint32_t queueIndex = 0); // defaults: family from getVulkanQueueFamily(), queue index 0
+    operator VkDevice() const; // implicit conversion to the raw handle
+class VulkanQueue { // wraps a VkQueue and offers several submit() overloads
+    friend class VulkanDevice;
+    VkQueue m_vkQueue;
+    VulkanQueue(VkQueue vkQueue);
+    VulkanQueue(const VulkanQueue &queue);
+    virtual ~VulkanQueue();
+    const VulkanQueueFamily &getQueueFamily();
+    void submit(const VulkanSemaphoreList &waitSemaphoreList, // list form: wait semaphores, command buffers, signal semaphores
+                const VulkanCommandBufferList &commandBufferList,
+                const VulkanSemaphoreList &signalSemaphoreList);
+    void submit(const VulkanSemaphore &waitSemaphore, // single wait semaphore, command buffer and signal semaphore
+                const VulkanCommandBuffer &commandBuffer,
+                const VulkanSemaphore &signalSemaphore);
+    void submit(const VulkanCommandBuffer &commandBuffer, // no wait semaphore
+                const VulkanSemaphore &signalSemaphore);
+    void submit(const VulkanCommandBuffer &commandBuffer); // no semaphores at all
+    void waitIdle();
+    operator VkQueue() const; // implicit conversion to the raw handle
+class VulkanDescriptorSetLayoutBinding { // wraps one VkDescriptorSetLayoutBinding value
+    VkDescriptorSetLayoutBinding m_vkDescriptorSetLayoutBinding;
+    VulkanDescriptorSetLayoutBinding(
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding);
+    VulkanDescriptorSetLayoutBinding(
+        uint32_t binding, VulkanDescriptorType descriptorType,
+        uint32_t descriptorCount = 1, // defaults: one descriptor, compute shader stage
+        VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE);
+    virtual ~VulkanDescriptorSetLayoutBinding();
+    operator VkDescriptorSetLayoutBinding() const; // implicit conversion to the Vulkan struct
+class VulkanDescriptorSetLayout { // wraps a VkDescriptorSetLayout created on a VulkanDevice
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkDescriptorSetLayout m_vkDescriptorSetLayout;
+    VulkanDescriptorSetLayout(
+        const VulkanDescriptorSetLayout &descriptorSetLayout);
+    void
+    VulkanDescriptorSetLayoutCommon(const VulkanDescriptorSetLayoutBindingList // presumably shared by the constructors below — confirm
+                                        &descriptorSetLayoutBindingList);
+    VulkanDescriptorSetLayout(
+        const VulkanDevice &device,
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding); // one binding
+    VulkanDescriptorSetLayout(
+        const VulkanDevice &device,
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0,
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1); // two bindings
+    VulkanDescriptorSetLayout(const VulkanDevice &device,
+                              const VulkanDescriptorSetLayoutBindingList
+                                  &descriptorSetLayoutBindingList); // list of bindings
+    virtual ~VulkanDescriptorSetLayout();
+    operator VkDescriptorSetLayout() const; // implicit conversion to the raw handle
+class VulkanPipelineLayout { // wraps a VkPipelineLayout created on a VulkanDevice
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkPipelineLayout m_vkPipelineLayout;
+    VulkanPipelineLayout(const VulkanPipelineLayout &pipelineLayout);
+    void VulkanPipelineLayoutCommon( // presumably shared by the constructors below — confirm
+        const VulkanDescriptorSetLayoutList &descriptorSetLayoutList);
+    VulkanPipelineLayout(const VulkanDevice &device,
+                         const VulkanDescriptorSetLayout &descriptorSetLayout); // single set layout
+    VulkanPipelineLayout(
+        const VulkanDevice &device,
+        const VulkanDescriptorSetLayoutList &descriptorSetLayoutList =
+            getEmptyVulkanDescriptorSetLayoutList()); // default: the empty list
+    virtual ~VulkanPipelineLayout();
+    operator VkPipelineLayout() const; // implicit conversion to the raw handle
+class VulkanShaderModule { // wraps a VkShaderModule built from a byte vector of shader code
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkShaderModule m_vkShaderModule;
+    VulkanShaderModule(const VulkanShaderModule &shaderModule);
+    VulkanShaderModule(const VulkanDevice &device,
+                       const std::vector<char> &code); // presumably SPIR-V bytes — confirm against the definition
+    virtual ~VulkanShaderModule();
+    operator VkShaderModule() const; // implicit conversion to the raw handle
+class VulkanPipeline { // abstract base for pipelines: holds the VkPipeline handle
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkPipeline m_vkPipeline;
+    VulkanPipeline(const VulkanPipeline &pipeline);
+    VulkanPipeline(const VulkanDevice &device);
+    virtual ~VulkanPipeline();
+    virtual VulkanPipelineBindPoint getPipelineBindPoint() const = 0; // pure virtual: derived classes supply the bind point
+    operator VkPipeline() const; // implicit conversion to the raw handle
+class VulkanComputePipeline : public VulkanPipeline { // VulkanPipeline built from a layout and one shader module
+    VulkanComputePipeline(const VulkanComputePipeline &computePipeline);
+    VulkanComputePipeline(const VulkanDevice &device,
+                          const VulkanPipelineLayout &pipelineLayout,
+                          const VulkanShaderModule &shaderModule,
+                          const std::string &entryFuncName = "main"); // shader entry point name, "main" by default
+    virtual ~VulkanComputePipeline();
+    VulkanPipelineBindPoint getPipelineBindPoint() const; // implements the base class's pure virtual
+class VulkanDescriptorPool { // wraps a VkDescriptorPool created on a VulkanDevice
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkDescriptorPool m_vkDescriptorPool;
+    VulkanDescriptorPool(const VulkanDescriptorPool &descriptorPool);
+    void VulkanDescriptorPoolCommon(const VulkanDescriptorSetLayoutBindingList // presumably shared by the constructors below — confirm
+                                        &descriptorSetLayoutBindingList);
+    VulkanDescriptorPool(
+        const VulkanDevice &device,
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding); // one binding
+    VulkanDescriptorPool(
+        const VulkanDevice &device,
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0,
+        const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1); // two bindings
+    VulkanDescriptorPool(const VulkanDevice &device,
+                         const VulkanDescriptorSetLayoutBindingList
+                             &descriptorSetLayoutBindingList); // list of bindings
+    virtual ~VulkanDescriptorPool();
+    operator VkDescriptorPool() const; // implicit conversion to the raw handle
+class VulkanDescriptorSet { // wraps a VkDescriptorSet tied to a device and a descriptor pool
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    const VulkanDescriptorPool &m_descriptorPool; // reference member: the pool must outlive this object
+    VkDescriptorSet m_vkDescriptorSet;
+    VulkanDescriptorSet(const VulkanDescriptorSet &descriptorSet);
+    VulkanDescriptorSet(const VulkanDevice &device,
+                        const VulkanDescriptorPool &descriptorPool,
+                        const VulkanDescriptorSetLayout &descriptorSetLayout);
+    virtual ~VulkanDescriptorSet();
+    void update(uint32_t binding, const VulkanBuffer &buffer); // buffer overload
+    void update(uint32_t binding, const VulkanImageView &imageView); // image-view overload
+    operator VkDescriptorSet() const; // implicit conversion to the raw handle
+// Value wrapper around VkOffset3D with x/y/z accessors and an implicit
+// conversion back to VkOffset3D. All three components default to 0.
+class VulkanOffset3D {
+    VkOffset3D m_vkOffset3D;
+    // Copy constructor; the parameter is named after the type it copies.
+    VulkanOffset3D(const VulkanOffset3D &offset3D);
+    VulkanOffset3D(uint32_t x = 0, uint32_t y = 0, uint32_t z = 0);
+    virtual ~VulkanOffset3D();
+    uint32_t getX() const;
+    uint32_t getY() const;
+    uint32_t getZ() const;
+    operator VkOffset3D() const;
+class VulkanExtent3D { // value wrapper around VkExtent3D
+    VkExtent3D m_vkExtent3D;
+    VulkanExtent3D(const VulkanExtent3D &extent3D);
+    VulkanExtent3D(uint32_t width, uint32_t height = 1, uint32_t depth = 1); // height and depth default to 1
+    virtual ~VulkanExtent3D();
+    uint32_t getWidth() const;
+    uint32_t getHeight() const;
+    uint32_t getDepth() const;
+    operator VkExtent3D() const; // implicit conversion to the Vulkan struct
+class VulkanCommandPool { // wraps a VkCommandPool created on a VulkanDevice for one queue family
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkCommandPool m_vkCommandPool;
+    VulkanCommandPool(const VulkanCommandPool &commandPool);
+    VulkanCommandPool(
+        const VulkanDevice &device,
+        const VulkanQueueFamily &queueFamily = getVulkanQueueFamily()); // default family comes from getVulkanQueueFamily()
+    virtual ~VulkanCommandPool();
+    operator VkCommandPool() const; // implicit conversion to the raw handle
+class VulkanCommandBuffer { // wraps a VkCommandBuffer and the commands recorded between begin() and end()
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    const VulkanCommandPool &m_commandPool; // reference member: the pool must outlive this object
+    VkCommandBuffer m_vkCommandBuffer;
+    VulkanCommandBuffer(const VulkanCommandBuffer &commandBuffer);
+    VulkanCommandBuffer(const VulkanDevice &device,
+                        const VulkanCommandPool &commandPool);
+    virtual ~VulkanCommandBuffer();
+    void begin();
+    void bindPipeline(const VulkanPipeline &pipeline);
+    void bindDescriptorSets(const VulkanPipeline &pipeline,
+                            const VulkanPipelineLayout &pipelineLayout,
+                            const VulkanDescriptorSet &descriptorSet);
+    void pipelineBarrier(const VulkanImage2DList &image2DList, // layout change applied to every image in the list
+                         VulkanImageLayout oldImageLayout,
+                         VulkanImageLayout newImageLayout);
+    void dispatch(uint32_t groupCountX, uint32_t groupCountY,
+                  uint32_t groupCountZ);
+    void fillBuffer(const VulkanBuffer &buffer, uint32_t data,
+                    uint64_t offset = 0, uint64_t size = VK_WHOLE_SIZE); // defaults cover the whole buffer
+    void updateBuffer(const VulkanBuffer &buffer, void *pdata,
+                      uint64_t offset = 0, uint64_t size = VK_WHOLE_SIZE); // defaults cover the whole buffer
+    void copyBufferToImage(const VulkanBuffer &buffer, const VulkanImage &image, // layout-only overload
+                           VulkanImageLayout imageLayout =
+                               VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+    void copyBufferToImage(const VulkanBuffer &buffer, const VulkanImage &image, // NOTE(review): a two-argument call is ambiguous with the overload above
+                           uint64_t bufferOffset = 0, uint32_t mipLevel = 0,
+                           uint32_t baseArrayLayer = 0, uint32_t layerCount = 1,
+                           VulkanOffset3D offset3D = VulkanOffset3D(0, 0, 0),
+                           VulkanExtent3D extent3D = VulkanExtent3D(0, 0, 0));
+    void copyImageToBuffer(const VulkanImage &image, const VulkanBuffer &buffer,
+                           uint64_t bufferOffset = 0, uint32_t mipLevel = 0,
+                           uint32_t baseArrayLayer = 0, uint32_t layerCount = 1,
+                           VulkanOffset3D offset3D = VulkanOffset3D(0, 0, 0),
+                           VulkanExtent3D extent3D = VulkanExtent3D(0, 0, 0));
+    void end();
+    operator VkCommandBuffer() const; // implicit conversion to the raw handle
+class VulkanBuffer { // wraps a VkBuffer with its size, alignment and list of memory types
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkBuffer m_vkBuffer;
+    uint64_t m_size;
+    uint64_t m_alignment;
+    VulkanMemoryTypeList m_memoryTypeList;
+    VulkanBuffer(const VulkanBuffer &buffer);
+    VulkanBuffer(const VulkanDevice &device, uint64_t size,
+                 VulkanExternalMemoryHandleType externalMemoryHandleType =
+                 VulkanBufferUsage bufferUsage =
+                 VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE,
+                 const VulkanQueueFamilyList &queueFamilyList =
+                     getEmptyVulkanQueueFamilyList()); // default: the empty queue-family list
+    virtual ~VulkanBuffer();
+    uint64_t getSize() const;
+    uint64_t getAlignment() const;
+    const VulkanMemoryTypeList &getMemoryTypeList() const;
+    operator VkBuffer() const; // implicit conversion to the raw handle
+class VulkanImage { // wraps a VkImage with its type, format, extent, mip/layer counts and memory requirements
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    const VulkanImageType m_imageType;
+    const VulkanExtent3D m_extent3D;
+    const VulkanFormat m_format;
+    const uint32_t m_numMipLevels;
+    const uint32_t m_numLayers;
+    VkImage m_vkImage;
+    uint64_t m_size;
+    uint64_t m_alignment;
+    VulkanMemoryTypeList m_memoryTypeList; // filled from the image's memoryTypeBits in the constructor
+    VkImageCreateInfo VulkanImageCreateInfo; // data member despite the missing m_ prefix; returned by getVkImageCreateInfo()
+    VulkanImage(const VulkanImage &image);
+    VulkanImage(
+        const VulkanDevice &device, VulkanImageType imageType,
+        VulkanFormat format, const VulkanExtent3D &extent3D,
+        uint32_t numMipLevels = 1, uint32_t arrayLayers = 1,
+        VulkanExternalMemoryHandleType externalMemoryHandleType =
+        VulkanImageCreateFlag imageCreateFlags = VULKAN_IMAGE_CREATE_FLAG_NONE,
+        VulkanImageTiling imageTiling = VULKAN_IMAGE_TILING_OPTIMAL,
+        VulkanImageUsage imageUsage =
+        VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE);
+    virtual ~VulkanImage();
+    virtual VulkanExtent3D getExtent3D(uint32_t mipLevel = 0) const; // base implementation returns a zero extent
+    VulkanFormat getFormat() const;
+    uint32_t getNumMipLevels() const;
+    uint32_t getNumLayers() const;
+    uint64_t getSize() const;
+    uint64_t getAlignment() const;
+    const VulkanMemoryTypeList &getMemoryTypeList() const;
+    VkImageCreateInfo getVkImageCreateInfo() const; // returns the create info by value
+    operator VkImage() const; // implicit conversion to the raw handle
+class VulkanImage2D : public VulkanImage { // 2D specialisation of VulkanImage (depth 1, one layer, optimal tiling)
+    VkImageView m_vkImageView; // NOTE(review): no initializer for this member is visible in the constructors — confirm it is used
+    VulkanImage2D(const VulkanImage2D &image2D);
+    VulkanImage2D(
+        const VulkanDevice &device, VulkanFormat format, uint32_t width,
+        uint32_t height, uint32_t numMipLevels = 1,
+        VulkanExternalMemoryHandleType externalMemoryHandleType =
+        VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE,
+        VulkanImageUsage imageUsage =
+        VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE);
+    virtual ~VulkanImage2D();
+    virtual VulkanExtent3D getExtent3D(uint32_t mipLevel = 0) const; // width/height shifted right by mipLevel, at least 1; depth 1
+class VulkanImageView { // wraps a VkImageView created on a VulkanDevice for a VulkanImage
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkImageView m_vkImageView;
+    VulkanImageView(const VulkanImageView &imageView);
+    VulkanImageView(const VulkanDevice &device, const VulkanImage &image,
+                    VulkanImageViewType imageViewType,
+                    uint32_t baseMipLevel = 0,
+                    uint32_t mipLevelCount = VULKAN_REMAINING_MIP_LEVELS, // the definition names this parameter levelCount
+                    uint32_t baseArrayLayer = 0,
+                    uint32_t layerCount = VULKAN_REMAINING_ARRAY_LAYERS);
+    virtual ~VulkanImageView();
+    operator VkImageView() const; // implicit conversion to the raw handle
+class VulkanDeviceMemory { // wraps a VkDeviceMemory allocation, optionally exportable as an external handle
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkDeviceMemory m_vkDeviceMemory;
+    uint64_t m_size;
+    bool m_isDedicated; // false for the size-based constructor, true for the image-based one
+    VulkanDeviceMemory(const VulkanDeviceMemory &deviceMemory);
+    VulkanDeviceMemory(const VulkanDevice &device, uint64_t size, // plain allocation of `size`
+                       const VulkanMemoryType &memoryType,
+                       VulkanExternalMemoryHandleType externalMemoryHandleType =
+                           VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
+                       const void *name = NULL); // cast to LPCWSTR in the Win32 export path
+    VulkanDeviceMemory(const VulkanDevice &device, const VulkanImage &image, // dedicated allocation sized by image.getSize()
+                       const VulkanMemoryType &memoryType,
+                       VulkanExternalMemoryHandleType externalMemoryHandleType =
+                           VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
+                       const void *name = NULL);
+    virtual ~VulkanDeviceMemory();
+    uint64_t getSize() const;
+#ifdef _WIN32
+    getHandle(VulkanExternalMemoryHandleType externalMemoryHandleType) const;
+    int
+    getHandle(VulkanExternalMemoryHandleType externalMemoryHandleType) const;
+    bool isDedicated() const;
+    void *map(size_t offset = 0, size_t size = VK_WHOLE_SIZE); // defaults map the whole allocation from offset 0
+    void unmap();
+    void bindBuffer(const VulkanBuffer &buffer, uint64_t offset = 0);
+    void bindImage(const VulkanImage &image, uint64_t offset = 0);
+    operator VkDeviceMemory() const; // implicit conversion to the raw handle
+class VulkanSemaphore { // wraps a VkSemaphore, optionally exportable as an external handle, with an optional name
+    friend class VulkanQueue;
+    const VulkanDevice &m_device; // reference member: the device must outlive this object
+    VkSemaphore m_vkSemaphore;
+    const std::wstring m_name; // passed as the Win32 export name when non-empty
+    VulkanSemaphore(const VulkanSemaphore &semaphore);
+    VulkanSemaphore(
+        const VulkanDevice &device,
+        VulkanExternalSemaphoreHandleType externalSemaphoreHandleType =
+        const std::wstring name = L""); // default: empty name
+    virtual ~VulkanSemaphore();
+#ifdef _WIN32
+    HANDLE getHandle(
+        VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const;
+    int getHandle(
+        VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const;
+    const std::wstring &getName() const;
+    operator VkSemaphore() const; // implicit conversion to the raw handle
+#define VK_FUNC_DECL(name) extern "C" PFN_##name _##name;
+#if defined(_WIN32) || defined(_WIN64)
+#undef VK_FUNC_DECL
+#endif // _vulkan_wrapper_hpp_
diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper_types.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper_types.hpp
new file mode 100644
index 0000000..359bcae
--- /dev/null
+++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper_types.hpp
@@ -0,0 +1,463 @@

+// Copyright (c) 2022 The Khronos Group Inc.


+// Licensed under the Apache License, Version 2.0 (the "License");

+// you may not use this file except in compliance with the License.

+// You may obtain a copy of the License at




+// Unless required by applicable law or agreed to in writing, software

+// distributed under the License is distributed on an "AS IS" BASIS,

+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+// See the License for the specific language governing permissions and

+// limitations under the License.



+#ifndef _vulkan_wrapper_types_hpp_

+#define _vulkan_wrapper_types_hpp_


+#include <vulkan/vulkan.h>






+class VulkanInstance;

+class VulkanPhysicalDevice;

+class VulkanMemoryHeap;

+class VulkanMemoryType;

+class VulkanQueueFamily;

+class VulkanDevice;

+class VulkanQueue;

+class VulkanDescriptorSetLayoutBinding;

+class VulkanDescriptorSetLayout;

+class VulkanPipelineLayout;

+class VulkanShaderModule;

+class VulkanPipeline;

+class VulkanComputePipeline;

+class VulkanDescriptorPool;

+class VulkanDescriptorSet;

+class VulkanCommandPool;

+class VulkanCommandBuffer;

+class VulkanBuffer;

+class VulkanOffset3D;

+class VulkanExtent3D;

+class VulkanImage;

+class VulkanImage2D;

+class VulkanImageView;

+class VulkanDeviceMemory;

+class VulkanSemaphore;


+class VulkanPhysicalDeviceList;

+class VulkanMemoryHeapList;

+class VulkanMemoryTypeList;

+class VulkanQueueFamilyList;

+class VulkanQueueFamilyToQueueCountMap;

+class VulkanQueueFamilyToQueueListMap;

+class VulkanQueueList;

+class VulkanCommandBufferList;

+class VulkanDescriptorSetLayoutList;

+class VulkanBufferList;

+class VulkanImage2DList;

+class VulkanImageViewList;

+class VulkanDeviceMemoryList;

+class VulkanSemaphoreList;


+enum VulkanQueueFlag









+enum VulkanDescriptorType





















+enum VulkanShaderStage









+enum VulkanPipelineBindPoint






+enum VulkanMemoryTypeProperty





























+enum VulkanMemoryHeapFlag






+enum VulkanExternalMemoryHandleType














+enum VulkanExternalSemaphoreHandleType














+enum VulkanBufferUsage





















+enum VulkanSharingMode






+enum VulkanImageType







+enum VulkanFormat































































+        VK_FORMAT_A2R10G10B10_USCALED_PACK32,


+        VK_FORMAT_A2R10G10B10_SSCALED_PACK32,






+        VK_FORMAT_A2B10G10R10_USCALED_PACK32,


+        VK_FORMAT_A2B10G10R10_SSCALED_PACK32,


























































































































+enum VulkanImageLayout










+enum VulkanImageUsage





















+enum VulkanImageTiling






+enum VulkanImageCreateFlag











+enum VulkanImageViewType











+#endif // _vulkan_wrapper_types_hpp_

diff --git a/test_conformance/workgroups/CMakeLists.txt b/test_conformance/workgroups/CMakeLists.txt
index 0888608..0c004b3 100644
--- a/test_conformance/workgroups/CMakeLists.txt
+++ b/test_conformance/workgroups/CMakeLists.txt
@@ -5,15 +5,8 @@
-    test_wg_reduce.cpp
-    test_wg_reduce_max.cpp
-    test_wg_reduce_min.cpp
-    test_wg_scan_exclusive_add.cpp
-    test_wg_scan_exclusive_min.cpp
-    test_wg_scan_exclusive_max.cpp
-    test_wg_scan_inclusive_add.cpp
-    test_wg_scan_inclusive_min.cpp
-    test_wg_scan_inclusive_max.cpp
+    test_wg_scan_reduce.cpp
+    test_wg_suggested_local_work_size.cpp
diff --git a/test_conformance/workgroups/main.cpp b/test_conformance/workgroups/main.cpp
index 41ffa74..abb1145 100644
--- a/test_conformance/workgroups/main.cpp
+++ b/test_conformance/workgroups/main.cpp
@@ -24,27 +24,30 @@
 test_definition test_list[] = {
-    ADD_TEST(work_group_all),
-    ADD_TEST(work_group_any),
-    ADD_TEST(work_group_reduce_add),
-    ADD_TEST(work_group_reduce_min),
-    ADD_TEST(work_group_reduce_max),
-    ADD_TEST(work_group_scan_inclusive_add),
-    ADD_TEST(work_group_scan_inclusive_min),
-    ADD_TEST(work_group_scan_inclusive_max),
-    ADD_TEST(work_group_scan_exclusive_add),
-    ADD_TEST(work_group_scan_exclusive_min),
-    ADD_TEST(work_group_scan_exclusive_max),
-    ADD_TEST(work_group_broadcast_1D),
-    ADD_TEST(work_group_broadcast_2D),
-    ADD_TEST(work_group_broadcast_3D),
+    ADD_TEST_VERSION(work_group_all, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_any, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_reduce_add, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_reduce_min, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_reduce_max, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_inclusive_add, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_inclusive_min, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_inclusive_max, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_exclusive_add, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_exclusive_min, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_exclusive_max, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_broadcast_1D, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_broadcast_2D, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_broadcast_3D, Version(2, 0)),
+    ADD_TEST(work_group_suggested_local_size_1D),
+    ADD_TEST(work_group_suggested_local_size_2D),
+    ADD_TEST(work_group_suggested_local_size_3D)
 const int test_num = ARRAY_SIZE(test_list);
 test_status InitCL(cl_device_id device) {
     auto version = get_device_cl_version(device);
-    auto expected_min_version = Version(2, 0);
+    auto expected_min_version = Version(1, 2);
     if (version < expected_min_version)
         version_expected_info("Test", "OpenCL",
diff --git a/test_conformance/workgroups/procs.h b/test_conformance/workgroups/procs.h
index 2e6e79e..6143d52 100644
--- a/test_conformance/workgroups/procs.h
+++ b/test_conformance/workgroups/procs.h
@@ -1,6 +1,6 @@
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,6 +16,7 @@
 #include "harness/testHarness.h"
 #include "harness/kernelHelpers.h"
 #include "harness/errorHelpers.h"
+#include "harness/typeWrappers.h"
 #include "harness/conversions.h"
 #include "harness/mt19937.h"
@@ -36,3 +37,16 @@
 extern int test_work_group_scan_inclusive_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int test_work_group_scan_inclusive_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int test_work_group_scan_inclusive_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_work_group_suggested_local_size_1D(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int n_elems);
+extern int test_work_group_suggested_local_size_2D(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int n_elems);
+extern int test_work_group_suggested_local_size_3D(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int n_elems);
diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp
index 3555947..2938021 100644
--- a/test_conformance/workgroups/test_wg_broadcast.cpp
+++ b/test_conformance/workgroups/test_wg_broadcast.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <algorithm>
 #include "procs.h"
@@ -310,7 +312,7 @@
         localsize[0] = localsize[1] = 1;
-    num_workgroups = MAX(n_elems/wg_size[0], 16);
+    num_workgroups = std::max(n_elems / wg_size[0], (size_t)16);
     globalsize[0] = num_workgroups * localsize[0];
     globalsize[1] = num_workgroups * localsize[1];
     num_elements = globalsize[0] * globalsize[1];
@@ -437,7 +439,7 @@
         localsize[0] = localsize[1] = localsize[2] = 1;
-    num_workgroups = MAX(n_elems/wg_size[0], 8);
+    num_workgroups = std::max(n_elems / wg_size[0], (size_t)8);
     globalsize[0] = num_workgroups * localsize[0];
     globalsize[1] = num_workgroups * localsize[1];
     globalsize[2] = num_workgroups * localsize[2];
diff --git a/test_conformance/workgroups/test_wg_reduce.cpp b/test_conformance/workgroups/test_wg_reduce.cpp
deleted file mode 100644
index eb26f49..0000000
--- a/test_conformance/workgroups/test_wg_reduce.cpp
+++ /dev/null
@@ -1,596 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_reduce_add_kernel_code_int =
-"__kernel void test_wg_reduce_add_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_reduce_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_add_kernel_code_uint =
-"__kernel void test_wg_reduce_add_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_reduce_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_add_kernel_code_long =
-"__kernel void test_wg_reduce_add_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_reduce_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_add_kernel_code_ulong =
-"__kernel void test_wg_reduce_add_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_reduce_add(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_reduce_add_int(int *inptr, int *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        int sum = 0;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            sum += inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( sum != outptr[i+j] )
-            {
-                log_info("work_group_reduce_add int: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        unsigned int sum = 0;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            sum += inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( sum != outptr[i+j] )
-            {
-                log_info("work_group_reduce_add uint: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        cl_long sum = 0;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            sum += inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( sum != outptr[i+j] )
-            {
-                log_info("work_group_reduce_add long: Error at %u: expected = %lld, got = %lld\n", i+j, sum, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        cl_ulong sum = 0;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            sum += inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( sum != outptr[i+j] )
-            {
-                log_info("work_group_reduce_add ulong: Error at %u: expected = %llu, got = %llu\n", i+j, sum, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-test_work_group_reduce_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_add_kernel_code_int,
-                                      "test_wg_reduce_add_int");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_add int failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_add int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_add_kernel_code_uint,
-                                      "test_wg_reduce_add_uint");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_add uint failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_add uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_add_kernel_code_long,
-                                      "test_wg_reduce_add_long");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_add long failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_add long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_add_kernel_code_ulong,
-                                      "test_wg_reduce_add_ulong");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_add ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_add ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_reduce_add_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_add_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_add_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_add_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_reduce_max.cpp b/test_conformance/workgroups/test_wg_reduce_max.cpp
deleted file mode 100644
index 3bbd3f2..0000000
--- a/test_conformance/workgroups/test_wg_reduce_max.cpp
+++ /dev/null
@@ -1,632 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_reduce_max_kernel_code_int =
-"__kernel void test_wg_reduce_max_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_reduce_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_max_kernel_code_uint =
-"__kernel void test_wg_reduce_max_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_reduce_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_max_kernel_code_long =
-"__kernel void test_wg_reduce_max_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_reduce_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_max_kernel_code_ulong =
-"__kernel void test_wg_reduce_max_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_reduce_max(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_reduce_max_int(int *inptr, int *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        int max = CL_INT_MIN;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            max = (max > inptr[i+j]) ? max : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( max != outptr[i+j] )
-            {
-                log_info("work_group_reduce_max int: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        unsigned int max = 0;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            max = (max > inptr[i+j]) ? max : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( max != outptr[i+j] )
-            {
-                log_info("work_group_reduce_max uint: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        cl_long max = CL_LONG_MIN;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            max = (max > inptr[i+j]) ? max : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( max != outptr[i+j] )
-            {
-                log_info("work_group_reduce_max long: Error at %u: expected = %lld, got = %lld\n", i+j, max, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        cl_ulong max = 0;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            max = (max > inptr[i+j]) ? max : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( max != outptr[i+j] )
-            {
-                log_info("work_group_reduce_max ulong: Error at %u: expected = %llu, got = %llu\n", i+j, max, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-test_work_group_reduce_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_max_kernel_code_int,
-                                      "test_wg_reduce_max_int");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_max int failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_max int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_max_kernel_code_uint,
-                                      "test_wg_reduce_max_uint");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_max uint failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_max uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_max_kernel_code_long,
-                                      "test_wg_reduce_max_long");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_max long failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_max long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_max_kernel_code_ulong,
-                                      "test_wg_reduce_max_ulong");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_max ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_max ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_reduce_max_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_max_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_max_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_max_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_reduce_min.cpp b/test_conformance/workgroups/test_wg_reduce_min.cpp
deleted file mode 100644
index 7b1b22e..0000000
--- a/test_conformance/workgroups/test_wg_reduce_min.cpp
+++ /dev/null
@@ -1,632 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_reduce_min_kernel_code_int =
-"__kernel void test_wg_reduce_min_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_reduce_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_min_kernel_code_uint =
-"__kernel void test_wg_reduce_min_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_reduce_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_min_kernel_code_long =
-"__kernel void test_wg_reduce_min_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_reduce_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_reduce_min_kernel_code_ulong =
-"__kernel void test_wg_reduce_min_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_reduce_min(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_reduce_min_int(int *inptr, int *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        int min = CL_INT_MAX;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            min = (min < inptr[i+j]) ? min : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( min != outptr[i+j] )
-            {
-                log_info("work_group_reduce_min int: Error at %u: expected = %d, got = %d\n", i+j, min, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        unsigned int min = CL_UINT_MAX;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            min = (min < inptr[i+j]) ? min : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( min != outptr[i+j] )
-            {
-                log_info("work_group_reduce_min uint: Error at %u: expected = %d, got = %d\n", i+j, min, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        cl_long min = CL_ULONG_MAX;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            min = (min < inptr[i+j]) ? min : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( min != outptr[i+j] )
-            {
-                log_info("work_group_reduce_min long: Error at %u: expected = %lld, got = %lld\n", i+j, min, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_reduce_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
-    size_t     i, j;
-    for (i=0; i<n; i+=wg_size)
-    {
-        cl_ulong min = CL_ULONG_MAX;
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-            min = (min < inptr[i+j]) ? min : inptr[i+j];
-        for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
-        {
-            if ( min != outptr[i+j] )
-            {
-                log_info("work_group_reduce_min ulong: Error at %u: expected = %llu, got = %llu\n", i+j, min, outptr[i+j]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-test_work_group_reduce_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_min_kernel_code_int,
-                                      "test_wg_reduce_min_int");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_min_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_min int failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_min int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_min_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_min_kernel_code_uint,
-                                      "test_wg_reduce_min_uint");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_min_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_min uint failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_min uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_min_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_min_kernel_code_long,
-                                      "test_wg_reduce_min_long");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_min_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_min long failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_min long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_min_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_reduce_min_kernel_code_ulong,
-                                      "test_wg_reduce_min_ulong");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_reduce_min_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_reduce_min ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_reduce_min ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_reduce_min(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_reduce_min_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_min_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_min_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_reduce_min_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp
deleted file mode 100644
index e695a16..0000000
--- a/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp
+++ /dev/null
@@ -1,604 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_scan_exclusive_add_kernel_code_int =
-"__kernel void test_wg_scan_exclusive_add_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_scan_exclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_add_kernel_code_uint =
-"__kernel void test_wg_scan_exclusive_add_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_scan_exclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_add_kernel_code_long =
-"__kernel void test_wg_scan_exclusive_add_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_scan_exclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_add_kernel_code_ulong =
-"__kernel void test_wg_scan_exclusive_add_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_scan_exclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_scan_exclusive_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    int s, lasts;
-    for (j = 0; j < n; j += wg_size) {
-        m = n - j;
-        if (m > wg_size) m = wg_size;
-        s = 0;
-        lasts = 0;
-        for (i = 0; i < m; ++i) {
-            s += inptr[j + i];
-            if (outptr[j + i] != lasts) {
-                log_info("work_group_scan_exclusive_add int: Error at %u: expected = %d, got = %d\n",
-                         (unsigned int)(j + i), lasts, outptr[j + i]);
-                return -1;
-            }
-            lasts = s;
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    unsigned int s, lasts;
-    for (j = 0; j < n; j += wg_size) {
-        m = n - j;
-        if (m > wg_size) m = wg_size;
-        s = 0;
-        lasts = 0;
-        for (i = 0; i < m; ++i) {
-            s += inptr[j + i];
-            if (outptr[j + i] != lasts) {
-                log_info("work_group_scan_exclusive_add uint: Error at %u: expected = %u, got = %u\n",
-                        (unsigned int)(j + i), lasts, outptr[j + i]);
-                return -1;
-            }
-            lasts = s;
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    cl_long s, lasts;
-    for (j = 0; j < n; j += wg_size) {
-        m = n - j;
-        if (m > wg_size) m = wg_size;
-        s = 0;
-        lasts = 0;
-        for (i = 0; i < m; ++i) {
-            s += inptr[j + i];
-            if (outptr[j + i] != lasts) {
-                log_info("work_group_scan_exclusive_add long: Error at %u: expected = %lld, got = %lld\n",
-                         (unsigned int)(j + i), (long long)lasts, (long long)outptr[j + i]);
-                return -1;
-            }
-            lasts = s;
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    cl_ulong s, lasts;
-    for (j = 0; j < n; j += wg_size) {
-        m = n - j;
-        if (m > wg_size) m = wg_size;
-        s = 0;
-        lasts = 0;
-        for (i = 0; i < m; ++i) {
-            s += inptr[j + i];
-            if (outptr[j + i] != lasts) {
-                log_info("work_group_scan_exclusive_add ulong: Error at %u: expected = %llu, got = %llu\n",
-                         (unsigned int)(j + i), (unsigned long long)lasts, (unsigned long long)outptr[j + i]);
-                return -1;
-            }
-            lasts = s;
-        }
-    }
-    return 0;
-test_work_group_scan_exclusive_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_add_kernel_code_int,
-                                      "test_wg_scan_exclusive_add_int");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_add int failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_add int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_add_kernel_code_uint,
-                                      "test_wg_scan_exclusive_add_uint");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_add uint failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_add uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_add_kernel_code_long,
-                                      "test_wg_scan_exclusive_add_long");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_add long failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_add long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_add_kernel_code_ulong,
-                                      "test_wg_scan_exclusive_add_ulong");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusiveadd ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_add ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_scan_exclusive_add_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_add_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_add_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_add_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
deleted file mode 100644
index 12338b6..0000000
--- a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
+++ /dev/null
@@ -1,631 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_scan_exclusive_max_kernel_code_int =
-"__kernel void test_wg_scan_exclusive_max_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_scan_exclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_max_kernel_code_uint =
-"__kernel void test_wg_scan_exclusive_max_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_scan_exclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_max_kernel_code_long =
-"__kernel void test_wg_scan_exclusive_max_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_scan_exclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_max_kernel_code_ulong =
-"__kernel void test_wg_scan_exclusive_max_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_scan_exclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_scan_exclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        int max_ = 0x80000000;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_exclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-            max_ = MAX(inptr[j+i], max_);
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        unsigned int max_ = 0x0;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_exclusive_max int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-            max_ = MAX(inptr[j+i], max_);
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        cl_long max_ = 0x8000000000000000ULL;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_exclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-            max_ = MAX(inptr[j+i], max_);
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        cl_ulong max_ = 0x0;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_exclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-            max_ = MAX(inptr[j+i], max_);
-        }
-    }
-    return 0;
-test_work_group_scan_exclusive_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_max_kernel_code_int,
-                                      "test_wg_scan_exclusive_max_int");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_max int failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_max int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_max_kernel_code_uint,
-                                      "test_wg_scan_exclusive_max_uint");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_max uint failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_max uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_max_kernel_code_long,
-                                      "test_wg_scan_exclusive_max_long");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_max long failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_max long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_max_kernel_code_ulong,
-                                      "test_wg_scan_exclusive_max_ulong");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusiveadd ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_max ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_scan_exclusive_max_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_max_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_max_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_max_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
deleted file mode 100644
index f4e6bf9..0000000
--- a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
+++ /dev/null
@@ -1,632 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_scan_exclusive_min_kernel_code_int =
-"__kernel void test_wg_scan_exclusive_min_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_scan_exclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_min_kernel_code_uint =
-"__kernel void test_wg_scan_exclusive_min_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_scan_exclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_min_kernel_code_long =
-"__kernel void test_wg_scan_exclusive_min_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_scan_exclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_exclusive_min_kernel_code_ulong =
-"__kernel void test_wg_scan_exclusive_min_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_scan_exclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_scan_exclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        int min_ = 0x7fffffff;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_exclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-            min_ = MIN(inptr[j+i], min_);
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        unsigned int min_ = 0xffffffff;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_exclusive_min int: Error at %u: expected = %u, got = %u\n", j+i, min_, outptr[j+i]);
-                return -1;
-            }
-            min_ = MIN(inptr[j+i], min_);
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        cl_long min_ = 0x7fffffffffffffffULL;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_exclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-            min_ = MIN(inptr[j+i], min_);
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_exclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-     for (j=0; j<n; j+=wg_size) {
-        cl_ulong min_ = 0xffffffffffffffffULL;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_exclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-            min_ = MIN(inptr[j+i], min_);
-        }
-    }
-    return 0;
-test_work_group_scan_exclusive_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_min_kernel_code_int,
-                                      "test_wg_scan_exclusive_min_int");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_min_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_min int failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_min int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_min_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_min_kernel_code_uint,
-                                      "test_wg_scan_exclusive_min_uint");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_min_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_min uint failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_min uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_min_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_min_kernel_code_long,
-                                      "test_wg_scan_exclusive_min_long");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_min_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusive_min long failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_min long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_min_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       wg_sizes_per_dimension[3];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_exclusive_min_kernel_code_ulong,
-                                      "test_wg_scan_exclusive_min_ulong");
-    if (err)
-        return -1;
-    err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
-    if (err)
-        return -1;
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
-    if (err)
-        return -1;
-    if(wg_sizes_per_dimension[0] < wg_size[0])
-    {
-        wg_size[0] = wg_sizes_per_dimension[0];
-    }
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_exclusive_min_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_exclusiveadd ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_exclusive_min ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_exclusive_min(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_scan_exclusive_min_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_min_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_min_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_exclusive_min_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp
deleted file mode 100644
index 51c98a4..0000000
--- a/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_scan_inclusive_add_kernel_code_int =
-"__kernel void test_wg_scan_inclusive_add_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_scan_inclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_add_kernel_code_uint =
-"__kernel void test_wg_scan_inclusive_add_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_scan_inclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_add_kernel_code_long =
-"__kernel void test_wg_scan_inclusive_add_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_scan_inclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_add_kernel_code_ulong =
-"__kernel void test_wg_scan_inclusive_add_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_scan_inclusive_add(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_scan_inclusive_add_int(int *inptr, int *outptr, size_t n, size_t wg_size)
-    size_t i, j, m;
-    int s;
-    for (j=0; j<n; j+=wg_size) {
-    m = n - j;
-    if (m > wg_size)
-        m = wg_size;
-    s = 0;
-    for (i=0; i<m; ++i) {
-        s += inptr[j+i];
-        if (outptr[j+i] != s) {
-                log_info("work_group_scan_inclusive_add int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), s, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
-    size_t i, j, m;
-    unsigned int s;
-    for (j=0; j<n; j+=wg_size) {
-    m = n - j;
-    if (m > wg_size)
-        m = wg_size;
-    s = 0;
-    for (i=0; i<m; ++i) {
-        s += inptr[j+i];
-        if (outptr[j+i] != s) {
-                log_info("work_group_scan_inclusive_add uint: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), s, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
-    size_t i, j, m;
-    cl_long s;
-    for (j=0; j<n; j+=wg_size) {
-    m = n - j;
-    if (m > wg_size)
-        m = wg_size;
-    s = 0;
-    for (i=0; i<m; ++i) {
-        s += inptr[j+i];
-        if (outptr[j+i] != s) {
-                log_info("work_group_scan_inclusive_add long: Error at %u: expected = %lld, got = %lld\n",
-            (unsigned int)(j+i), (long long)s, (long long)outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
-    size_t i, j, m;
-    cl_ulong s;
-    for (j=0; j<n; j+=wg_size) {
-    m = n - j;
-    if (m > wg_size)
-        m = wg_size;
-    s = 0;
-    for (i=0; i<m; ++i) {
-        s += inptr[j+i];
-        if (outptr[j+i] != s) {
-                log_info("work_group_scan_inclusive_add int: Error at %u: expected = %llu, got = %llu\n",
-            (unsigned int)(j+i), (unsigned long long)s, (unsigned long long)outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-test_work_group_scan_inclusive_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_add_kernel_code_int,
-                                      "test_wg_scan_inclusive_add_int");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_add int failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_add int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_add_kernel_code_uint,
-                                      "test_wg_scan_inclusive_add_uint");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_add uint failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_add uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_add_kernel_code_long,
-                                      "test_wg_scan_inclusive_add_long");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_add long failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_add long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_add_kernel_code_ulong,
-                                      "test_wg_scan_inclusive_add_ulong");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusiveadd ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_add ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_scan_inclusive_add_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_add_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_add_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_add_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
deleted file mode 100644
index 44ebf80..0000000
--- a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
+++ /dev/null
@@ -1,595 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_scan_inclusive_max_kernel_code_int =
-"__kernel void test_wg_scan_inclusive_max_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_scan_inclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_max_kernel_code_uint =
-"__kernel void test_wg_scan_inclusive_max_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_scan_inclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_max_kernel_code_long =
-"__kernel void test_wg_scan_inclusive_max_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_scan_inclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_max_kernel_code_ulong =
-"__kernel void test_wg_scan_inclusive_max_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_scan_inclusive_max(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_scan_inclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        int max_ = 0x80000000;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_inclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        unsigned int max_ = 0x0;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_inclusive_max int: Error at %lu: expected = %u, got = %u\n", (unsigned long)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        cl_long max_ = 0x8000000000000000ULL;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_inclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        cl_ulong max_ = 0x0;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
-            if (outptr[j+i] != max_) {
-                log_info("work_group_scan_inclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-test_work_group_scan_inclusive_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_max_kernel_code_int,
-                                      "test_wg_scan_inclusive_max_int");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_max int failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_max int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_max_kernel_code_uint,
-                                      "test_wg_scan_inclusive_max_uint");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_max uint failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_max uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_max_kernel_code_long,
-                                      "test_wg_scan_inclusive_max_long");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_max long failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_max long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_max_kernel_code_ulong,
-                                      "test_wg_scan_inclusive_max_ulong");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusiveadd ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_max ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_scan_inclusive_max_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_max_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_max_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_max_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
deleted file mode 100644
index f2f0578..0000000
--- a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
+++ /dev/null
@@ -1,595 +0,0 @@
-// Copyright (c) 2017 The Khronos Group Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "harness/compat.h"
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "procs.h"
-const char *wg_scan_inclusive_min_kernel_code_int =
-"__kernel void test_wg_scan_inclusive_min_int(global int *input, global int *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    int result = work_group_scan_inclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_min_kernel_code_uint =
-"__kernel void test_wg_scan_inclusive_min_uint(global uint *input, global uint *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    uint result = work_group_scan_inclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_min_kernel_code_long =
-"__kernel void test_wg_scan_inclusive_min_long(global long *input, global long *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    long result = work_group_scan_inclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-const char *wg_scan_inclusive_min_kernel_code_ulong =
-"__kernel void test_wg_scan_inclusive_min_ulong(global ulong *input, global ulong *output)\n"
-"    int  tid = get_global_id(0);\n"
-"    ulong result = work_group_scan_inclusive_min(input[tid]);\n"
-"    output[tid] = result;\n"
-static int
-verify_wg_scan_inclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        int min_ = 0x7fffffff;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_inclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        unsigned int min_ = 0xffffffff;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_inclusive_min int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-    for (j=0; j<n; j+=wg_size) {
-        cl_long min_ = 0x7fffffffffffffffULL;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_inclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-static int
-verify_wg_scan_inclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) {
-    size_t i, j, m;
-     for (j=0; j<n; j+=wg_size) {
-        cl_ulong min_ = 0xffffffffffffffffULL;
-        m = n - j;
-        if (m > wg_size)
-            m = wg_size;
-        for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
-            if (outptr[j+i] != min_) {
-                log_info("work_group_scan_inclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]);
-                return -1;
-            }
-        }
-    }
-    return 0;
-test_work_group_scan_inclusive_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_int       *input_ptr[1], *p;
-    cl_int       *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_min_kernel_code_int,
-                                      "test_wg_scan_inclusive_min_int");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_min_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_min int failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_min int passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_min_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_uint      *input_ptr[1], *p;
-    cl_uint      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_min_kernel_code_uint,
-                                      "test_wg_scan_inclusive_min_uint");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_min_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_min uint failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_min uint passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_min_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_long      *input_ptr[1], *p;
-    cl_long      *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_min_kernel_code_long,
-                                      "test_wg_scan_inclusive_min_long");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_long) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_min_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusive_min long failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_min long passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_min_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    cl_mem       streams[2];
-    cl_ulong     *input_ptr[1], *p;
-    cl_ulong     *output_ptr;
-    cl_program   program;
-    cl_kernel    kernel;
-    void         *values[2];
-    size_t       threads[1];
-    size_t       wg_size[1];
-    size_t       num_elements;
-    int          err;
-    int          i;
-    MTdata       d;
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &wg_scan_inclusive_min_kernel_code_ulong,
-                                      "test_wg_scan_inclusive_min_ulong");
-    if (err)
-        return -1;
-    // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
-    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
-    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
-    num_elements = n_elems;
-    input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_ulong) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    p = input_ptr[0];
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        p[i] = genrand_int64(d);
-    free_mtdata(d); d = NULL;
-    err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-    values[0] = streams[0];
-    values[1] = streams[1];
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-    // Line below is troublesome...
-    threads[0] = (size_t)n_elems;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-    cl_uint dead = 0xdeaddead;
-    memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-    if (verify_wg_scan_inclusive_min_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
-    {
-        log_error("work_group_scan_inclusiveadd ulong failed\n");
-        return -1;
-    }
-    log_info("work_group_scan_inclusive_min ulong passed\n");
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr[0]);
-    free(output_ptr);
-    return err;
-test_work_group_scan_inclusive_min(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-    int err;
-    err = test_work_group_scan_inclusive_min_int(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_min_uint(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_min_long(device, context, queue, n_elems);
-    if (err) return err;
-    err = test_work_group_scan_inclusive_min_ulong(device, context, queue, n_elems);
-    return err;
diff --git a/test_conformance/workgroups/test_wg_scan_reduce.cpp b/test_conformance/workgroups/test_wg_scan_reduce.cpp
new file mode 100644
index 0000000..bf4dc89
--- /dev/null
+++ b/test_conformance/workgroups/test_wg_scan_reduce.cpp
@@ -0,0 +1,456 @@
+// Copyright (c) 2017-2022 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "harness/compat.h"
+#include <algorithm>
+#include <limits>
+#include <vector>
+#include "procs.h"
+static std::string make_kernel_string(const std::string &type,
+                                      const std::string &kernelName,
+                                      const std::string &func)
+    // Build a kernel string of the form:
+    // __kernel void KERNEL_NAME(global TYPE *input, global TYPE *output) {
+    //     int  tid = get_global_id(0);
+    //     output[tid] = FUNC(input[tid]);
+    // }
+    std::ostringstream os;
+    os << "__kernel void " << kernelName << "(global " << type
+       << " *input, global " << type << " *output) {\n";
+    os << "    int tid = get_global_id(0);\n";
+    os << "    output[tid] = " << func << "(input[tid]);\n";
+    os << "}\n";
+    return os.str();
+template <typename T> struct TestTypeInfo
+template <> struct TestTypeInfo<cl_int>
+    static constexpr const char *deviceName = "int";
+template <> struct TestTypeInfo<cl_uint>
+    static constexpr const char *deviceName = "uint";
+template <> struct TestTypeInfo<cl_long>
+    static constexpr const char *deviceName = "long";
+template <> struct TestTypeInfo<cl_ulong>
+    static constexpr const char *deviceName = "ulong";
+template <typename T> struct Add
+    using Type = T;
+    static constexpr const char *opName = "add";
+    static constexpr T identityValue = 0;
+    static T combine(T a, T b) { return a + b; }
+template <typename T> struct Max
+    using Type = T;
+    static constexpr const char *opName = "max";
+    static constexpr T identityValue = std::numeric_limits<T>::min();
+    static T combine(T a, T b) { return std::max(a, b); }
+template <typename T> struct Min
+    using Type = T;
+    static constexpr const char *opName = "min";
+    static constexpr T identityValue = std::numeric_limits<T>::max();
+    static T combine(T a, T b) { return std::min(a, b); }
+template <typename C> struct Reduce
+    using Type = typename C::Type;
+    static constexpr const char *testName = "work_group_reduce";
+    static constexpr const char *testOpName = C::opName;
+    static constexpr const char *deviceTypeName =
+        TestTypeInfo<Type>::deviceName;
+    static constexpr const char *kernelName = "test_wg_reduce";
+    static int verify(Type *inptr, Type *outptr, size_t n_elems,
+                      size_t max_wg_size)
+    {
+        for (size_t i = 0; i < n_elems; i += max_wg_size)
+        {
+            size_t wg_size = std::min(max_wg_size, n_elems - i);
+            Type result = C::identityValue;
+            for (size_t j = 0; j < wg_size; j++)
+            {
+                result = C::combine(result, inptr[i + j]);
+            }
+            for (size_t j = 0; j < wg_size; j++)
+            {
+                if (result != outptr[i + j])
+                {
+                    log_info("%s_%s: Error at %zu\n", testName, testOpName,
+                             i + j);
+                    return -1;
+                }
+            }
+        }
+        return 0;
+    }
+template <typename C> struct ScanInclusive
+    using Type = typename C::Type;
+    static constexpr const char *testName = "work_group_scan_inclusive";
+    static constexpr const char *testOpName = C::opName;
+    static constexpr const char *deviceTypeName =
+        TestTypeInfo<Type>::deviceName;
+    static constexpr const char *kernelName = "test_wg_scan_inclusive";
+    static int verify(Type *inptr, Type *outptr, size_t n_elems,
+                      size_t max_wg_size)
+    {
+        for (size_t i = 0; i < n_elems; i += max_wg_size)
+        {
+            size_t wg_size = std::min(max_wg_size, n_elems - i);
+            Type result = C::identityValue;
+            for (size_t j = 0; j < wg_size; ++j)
+            {
+                result = C::combine(result, inptr[i + j]);
+                if (result != outptr[i + j])
+                {
+                    log_info("%s_%s: Error at %zu\n", testName, testOpName,
+                             i + j);
+                    return -1;
+                }
+            }
+        }
+        return 0;
+    }
+template <typename C> struct ScanExclusive
+    using Type = typename C::Type;
+    static constexpr const char *testName = "work_group_scan_exclusive";
+    static constexpr const char *testOpName = C::opName;
+    static constexpr const char *deviceTypeName =
+        TestTypeInfo<Type>::deviceName;
+    static constexpr const char *kernelName = "test_wg_scan_exclusive";
+    static int verify(Type *inptr, Type *outptr, size_t n_elems,
+                      size_t max_wg_size)
+    {
+        for (size_t i = 0; i < n_elems; i += max_wg_size)
+        {
+            size_t wg_size = std::min(max_wg_size, n_elems - i);
+            Type result = C::identityValue;
+            for (size_t j = 0; j < wg_size; ++j)
+            {
+                if (result != outptr[i + j])
+                {
+                    log_info("%s_%s: Error at %zu\n", testName, testOpName,
+                             i + j);
+                    return -1;
+                }
+                result = C::combine(result, inptr[i + j]);
+            }
+        }
+        return 0;
+    }
+template <typename TestInfo>
+static int run_test(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int n_elems)
+    using T = typename TestInfo::Type;
+    cl_int err = CL_SUCCESS;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    std::string funcName = TestInfo::testName;
+    funcName += "_";
+    funcName += TestInfo::testOpName;
+    std::string kernelName = TestInfo::kernelName;
+    kernelName += "_";
+    kernelName += TestInfo::testOpName;
+    kernelName += "_";
+    kernelName += TestInfo::deviceTypeName;
+    std::string kernelString =
+        make_kernel_string(TestInfo::deviceTypeName, kernelName, funcName);
+    const char *kernel_source = kernelString.c_str();
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &kernel_source, kernelName.c_str());
+    test_error(err, "Unable to create test kernel");
+    size_t wg_size[1];
+    err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
+    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
+    clMemWrapper src = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                      sizeof(T) * n_elems, NULL, &err);
+    test_error(err, "Unable to create source buffer");
+    clMemWrapper dst = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                      sizeof(T) * n_elems, NULL, &err);
+    test_error(err, "Unable to create destination buffer");
+    std::vector<T> input_ptr(n_elems);
+    MTdataHolder d(gRandomSeed);
+    for (int i = 0; i < n_elems; i++)
+    {
+        input_ptr[i] = (T)genrand_int64(d);
+    }
+    err = clEnqueueWriteBuffer(queue, src, CL_TRUE, 0, sizeof(T) * n_elems,
+                     , 0, NULL, NULL);
+    test_error(err, "clWriteBuffer to initialize src buffer failed");
+    err = clSetKernelArg(kernel, 0, sizeof(src), &src);
+    test_error(err, "Unable to set src buffer kernel arg");
+    err |= clSetKernelArg(kernel, 1, sizeof(dst), &dst);
+    test_error(err, "Unable to set dst buffer kernel arg");
+    size_t global_work_size[] = { (size_t)n_elems };
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
+                                 wg_size, 0, NULL, NULL);
+    test_error(err, "Unable to enqueue test kernel");
+    std::vector<T> output_ptr(n_elems);
+    cl_uint dead = 0xdeaddead;
+    memset_pattern4(, &dead, sizeof(T) * n_elems);
+    err = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0, sizeof(T) * n_elems,
+                    , 0, NULL, NULL);
+    test_error(err, "clEnqueueReadBuffer to read read dst buffer failed");
+    if (TestInfo::verify(,, n_elems,
+                         wg_size[0]))
+    {
+        log_error("%s_%s %s failed\n", TestInfo::testName, TestInfo::testOpName,
+                  TestInfo::deviceTypeName);
+        return TEST_FAIL;
+    }
+    log_info("%s_%s %s passed\n", TestInfo::testName, TestInfo::testOpName,
+             TestInfo::deviceTypeName);
+    return TEST_PASS;
+// Runs run_test for Reduce<Add<T>> with T = cl_int and cl_uint, and also with
+// cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_reduce_add(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |= run_test<Reduce<Add<cl_int>>>(device, context, queue, n_elems);
+    result |= run_test<Reduce<Add<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |=
+            run_test<Reduce<Add<cl_long>>>(device, context, queue, n_elems);
+        result |=
+            run_test<Reduce<Add<cl_ulong>>>(device, context, queue, n_elems);
+    }
+    return result;
+}
+// Runs run_test for Reduce<Max<T>> with T = cl_int and cl_uint, and also with
+// cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_reduce_max(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |= run_test<Reduce<Max<cl_int>>>(device, context, queue, n_elems);
+    result |= run_test<Reduce<Max<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |=
+            run_test<Reduce<Max<cl_long>>>(device, context, queue, n_elems);
+        result |=
+            run_test<Reduce<Max<cl_ulong>>>(device, context, queue, n_elems);
+    }
+    return result;
+}
+// Runs run_test for Reduce<Min<T>> with T = cl_int and cl_uint, and also with
+// cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_reduce_min(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |= run_test<Reduce<Min<cl_int>>>(device, context, queue, n_elems);
+    result |= run_test<Reduce<Min<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |=
+            run_test<Reduce<Min<cl_long>>>(device, context, queue, n_elems);
+        result |=
+            run_test<Reduce<Min<cl_ulong>>>(device, context, queue, n_elems);
+    }
+    return result;
+}
+// Runs run_test for ScanInclusive<Add<T>> with T = cl_int and cl_uint, and
+// also with cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_scan_inclusive_add(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |=
+        run_test<ScanInclusive<Add<cl_int>>>(device, context, queue, n_elems);
+    result |=
+        run_test<ScanInclusive<Add<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |= run_test<ScanInclusive<Add<cl_long>>>(device, context, queue,
+                                                        n_elems);
+        result |= run_test<ScanInclusive<Add<cl_ulong>>>(device, context, queue,
+                                                         n_elems);
+    }
+    return result;
+}
+// Runs run_test for ScanInclusive<Max<T>> with T = cl_int and cl_uint, and
+// also with cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_scan_inclusive_max(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |=
+        run_test<ScanInclusive<Max<cl_int>>>(device, context, queue, n_elems);
+    result |=
+        run_test<ScanInclusive<Max<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |= run_test<ScanInclusive<Max<cl_long>>>(device, context, queue,
+                                                        n_elems);
+        result |= run_test<ScanInclusive<Max<cl_ulong>>>(device, context, queue,
+                                                         n_elems);
+    }
+    return result;
+}
+// Runs run_test for ScanInclusive<Min<T>> with T = cl_int and cl_uint, and
+// also with cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_scan_inclusive_min(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |=
+        run_test<ScanInclusive<Min<cl_int>>>(device, context, queue, n_elems);
+    result |=
+        run_test<ScanInclusive<Min<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |= run_test<ScanInclusive<Min<cl_long>>>(device, context, queue,
+                                                        n_elems);
+        result |= run_test<ScanInclusive<Min<cl_ulong>>>(device, context, queue,
+                                                         n_elems);
+    }
+    return result;
+}
+// Runs run_test for ScanExclusive<Add<T>> with T = cl_int and cl_uint, and
+// also with cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_scan_exclusive_add(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |=
+        run_test<ScanExclusive<Add<cl_int>>>(device, context, queue, n_elems);
+    result |=
+        run_test<ScanExclusive<Add<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |= run_test<ScanExclusive<Add<cl_long>>>(device, context, queue,
+                                                        n_elems);
+        result |= run_test<ScanExclusive<Add<cl_ulong>>>(device, context, queue,
+                                                         n_elems);
+    }
+    return result;
+}
+// Runs run_test for ScanExclusive<Max<T>> with T = cl_int and cl_uint, and
+// also with cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_scan_exclusive_max(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |=
+        run_test<ScanExclusive<Max<cl_int>>>(device, context, queue, n_elems);
+    result |=
+        run_test<ScanExclusive<Max<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |= run_test<ScanExclusive<Max<cl_long>>>(device, context, queue,
+                                                        n_elems);
+        result |= run_test<ScanExclusive<Max<cl_ulong>>>(device, context, queue,
+                                                         n_elems);
+    }
+    return result;
+}
+// Runs run_test for ScanExclusive<Min<T>> with T = cl_int and cl_uint, and
+// also with cl_long and cl_ulong when gHasLong is set.
+// Returns the bitwise OR of the individual run_test results.
+int test_work_group_scan_exclusive_min(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int n_elems)
+{
+    int result = TEST_PASS;
+    // Results are OR-ed rather than returned early so every element type runs
+    // even after an earlier one has failed.
+    result |=
+        run_test<ScanExclusive<Min<cl_int>>>(device, context, queue, n_elems);
+    result |=
+        run_test<ScanExclusive<Min<cl_uint>>>(device, context, queue, n_elems);
+    if (gHasLong)
+    {
+        result |= run_test<ScanExclusive<Min<cl_long>>>(device, context, queue,
+                                                        n_elems);
+        result |= run_test<ScanExclusive<Min<cl_ulong>>>(device, context, queue,
+                                                         n_elems);
+    }
+    return result;
+}
diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
new file mode 100644
index 0000000..aa02391
--- /dev/null
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -0,0 +1,611 @@
+// Copyright (c) 2021 The Khronos Group Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "harness/compat.h"
+#include <stdio.h>
+#include <iostream>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "procs.h"
+#include <CL/cl_ext.h>
+/** @brief Gets the number of elements of type s in a fixed length array of s */
+#define NELEMS(s) (sizeof(s) / sizeof((s)[0]))
+/** @brief If errCode is not CL_SUCCESS: reports it through
+ *  print_error(errCode, msg), calls free(ptr), and returns retValue from the
+ *  enclosing function. errCode is evaluated exactly once. */
+#define test_error_ret_and_free(errCode, msg, retValue, ptr)                   \
+    {                                                                          \
+        auto errCodeResult = errCode;                                          \
+        if (errCodeResult != CL_SUCCESS)                                       \
+        {                                                                      \
+            print_error(errCodeResult, msg);                                   \
+            free(ptr);                                                         \
+            return retValue;                                                   \
+        }                                                                      \
+    }
+/* OpenCL C source shared by the three test kernels. Only the work-item whose
+   global linear id is zero writes anything: it stores its local size in each
+   of the three dimensions to output[0..2].
+   From OpenCL C 2.0 on, the built-ins get_global_linear_id() and
+   get_enqueued_local_size() are used; for earlier language versions the linear
+   id is computed by hand and get_local_size() is used, because those built-ins
+   do not exist there. The two alternatives are selected by the preprocessor so
+   that neither a dead store nor an unreachable return remains.
+   LOCAL_MEM_SIZE must be supplied as a build option (-DLOCAL_MEM_SIZE=...). */
+const char* wg_scan_local_work_group_size = R"(
+    bool is_zero_linear_id()
+    {
+        size_t linear_id;
+#if defined(__OPENCL_C_VERSION__) && __OPENCL_C_VERSION__ >= 200
+        linear_id = get_global_linear_id();
+#else
+        linear_id = ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) * get_global_size(0)) +
+                    ((get_global_id(1) - get_global_offset(1)) * get_global_size(0)) +
+                    (get_global_id(0) - get_global_offset(0));
+#endif
+        return linear_id == 0;
+    }
+    uint get_l_size(size_t dim)
+    {
+#if defined(__OPENCL_C_VERSION__) && __OPENCL_C_VERSION__ >= 200
+        return get_enqueued_local_size(dim);
+#else
+        return get_local_size(dim);
+#endif
+    }
+    __kernel void test_wg_scan_local_work_group_size(global uint *output)
+    {
+        if(!is_zero_linear_id()) return;
+        for (uint i = 0; i < 3; i++)
+        {
+            output[i] = get_l_size(i);
+        }
+    }
+    __kernel void test_wg_scan_local_work_group_size_static_local(
+                                            global uint *output)
+    {
+        __local char c[LOCAL_MEM_SIZE];
+        if(!is_zero_linear_id()) return;
+        for (uint i = 0; i < 3; i++)
+        {
+            output[i] = get_l_size(i);
+        }
+    }
+    __kernel void test_wg_scan_local_work_group_size_dynlocal(
+                                        global uint *output,
+                                        __local char * c)
+    {
+        if(!is_zero_linear_id()) return;
+        for (uint i = 0; i < 3; i++)
+        {
+            output[i] = get_l_size(i);
+        }
+    };)";
+// Returns true when a is a prime number.
+// 0 and 1 are not prime. Trial division stops once the candidate divisor
+// exceeds sqrt(a); the bound is written as c <= a / c so that it cannot
+// overflow for large size_t values.
+bool is_prime(size_t a)
+{
+    if (a < 2) return false;
+    for (size_t c = 2; c <= a / c; c++)
+    {
+        if (a % c == 0) return false;
+    }
+    return true;
+}
+// Skip predicates handed to the sweep loop: a value is skipped when the
+// predicate returns true.
+// is_not_prime: true for every value that is_prime() rejects.
+bool is_not_prime(size_t a)
+{
+    const bool prime = is_prime(a);
+    return !prime;
+}
+// is_not_even: true for primes and for odd values.
+bool is_not_even(size_t a)
+{
+    if (is_prime(a)) return true;
+    return a % 2 == 1;
+}
+// is_not_odd: true for primes and for even values.
+bool is_not_odd(size_t a)
+{
+    if (is_prime(a)) return true;
+    return a % 2 == 0;
+}
+/* Candidate sizes intended for the second and third dimension of the global
+   work size. They cover several cases: 1024 is a power of 2, 3 is a small odd
+   prime, 12 is a multiple of 4 but not a power of 2, 1031 is a large odd
+   prime, and 1 tests a dimension that is effectively absent while the others
+   are present. */
+const size_t value_range[] = { 1024, 3, 12, 1031, 1 };
+/* Smaller candidate sizes for the experiments with 2D and 3D global work
+   sizes, so that the resulting number of work-items stays meaningful and does
+   not become too large: 64 is a power of 2, 3 is a small odd prime, 12 is a
+   multiple of 4 but not a power of 2, 113 is a large prime, and 1 tests a
+   dimension that is effectively absent while the others are present. */
+const size_t value_range_nD[] = { 64, 3, 12, 113, 1 };
+/* Step between candidate sizes in the odd and even sweeps. */
+const size_t basic_increment = 16;
+/* Step used in the prime sweep, so every value in the range is examined. */
+const size_t primes_increment = 1;
+// Number of ND-range dimensions used by a sweep. The enumerator value is
+// passed on unchanged as the work_dim argument of do_test().
+enum num_dims
+{
+    _1D = 1,
+    _2D = 2,
+    _3D = 3
+};
+/*
+ * Checks a single ND-range configuration.
+ *
+ * Asks clGetKernelSuggestedLocalWorkSizeKHR for a local work size, verifies
+ * that the suggested total does not exceed CL_KERNEL_WORK_GROUP_SIZE, then
+ * enqueues the kernel with a NULL local size and verifies that the local size
+ * written back by the kernel equals the suggestion.
+ *
+ *   scan_kernel         kernel whose argument 0 receives three cl_uint values
+ *   work_dim            number of dimensions used from the arrays below
+ *   global_work_offset  offset passed to both the query and the enqueue
+ *   test_values         global work size
+ *   dyn_mem_size        when non-zero, size in bytes of the local-memory
+ *                       argument set at index 1
+ *
+ * Returns CL_SUCCESS when everything matches, TEST_FAIL when the extension
+ * entry point cannot be resolved, and -1 for any other failure.
+ */
+int do_test(cl_device_id device, cl_context context, cl_command_queue queue,
+            cl_kernel scan_kernel, int work_dim, size_t global_work_offset[3],
+            size_t test_values[3], size_t dyn_mem_size)
+{
+    size_t local_work_size[] = { 1, 1, 1 };
+    size_t suggested_total_size;
+    size_t workgroupinfo_size;
+    cl_uint kernel_work_size[3] = { 0 };
+    clMemWrapper buffer;
+    cl_platform_id platform;
+    int err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
+                              &platform, NULL);
+    test_error_ret(err, "clGetDeviceInfo failed", -1);
+    // The entry point belongs to an extension, so it is resolved at run time.
+    clGetKernelSuggestedLocalWorkSizeKHR_fn
+        clGetKernelSuggestedLocalWorkSizeKHR =
+            (clGetKernelSuggestedLocalWorkSizeKHR_fn)
+                clGetExtensionFunctionAddressForPlatform(
+                    platform, "clGetKernelSuggestedLocalWorkSizeKHR");
+    if (clGetKernelSuggestedLocalWorkSizeKHR == NULL)
+    {
+        log_info("Extension 'cl_khr_suggested_local_work_size' could not be "
+                 "found.\n");
+        return TEST_FAIL;
+    }
+    /* Create the output buffer (no host pointer is supplied); the kernel
+     * writes its three local sizes into it. */
+    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                            sizeof(kernel_work_size), NULL, &err);
+    test_error_ret(err, "clCreateBuffer failed", -1);
+    err = clSetKernelArg(scan_kernel, 0, sizeof(buffer), &buffer);
+    test_error_ret(err, "clSetKernelArg failed", -1);
+    if (dyn_mem_size)
+    {
+        // A NULL value with a non-zero size sets a local-memory argument.
+        err = clSetKernelArg(scan_kernel, 1, dyn_mem_size, NULL);
+        test_error_ret(err, "clSetKernelArg failed", -1);
+    }
+    err = clGetKernelSuggestedLocalWorkSizeKHR(queue, scan_kernel, work_dim,
+                                               global_work_offset, test_values,
+                                               local_work_size);
+    test_error_ret(err, "clGetKernelSuggestedLocalWorkSizeKHR failed", -1);
+    // Dimensions beyond work_dim keep their initial value of 1, so the
+    // product is the total work-group size for any work_dim.
+    suggested_total_size =
+        local_work_size[0] * local_work_size[1] * local_work_size[2];
+    err = clGetKernelWorkGroupInfo(
+        scan_kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
+        sizeof(workgroupinfo_size), &workgroupinfo_size, NULL);
+    test_error_ret(err, "clGetKernelWorkGroupInfo failed", -1);
+    if (suggested_total_size > workgroupinfo_size)
+    {
+        std::cout << "The suggested work group size consist of "
+                  << suggested_total_size << " work items.\n"
+                  << "Work items are limited by " << workgroupinfo_size
+                  << std::endl;
+        std::cout << "Size from clGetKernelWorkGroupInfo: "
+                  << workgroupinfo_size;
+        std::cout << "\nSize from clGetKernelSuggestedLocalWorkSizeKHR: "
+                  << suggested_total_size << std::endl;
+        return -1;
+    }
+    // Enqueue with a NULL local size so the implementation picks it; that
+    // choice is what gets compared with the suggestion below.
+    err =
+        clEnqueueNDRangeKernel(queue, scan_kernel, work_dim, global_work_offset,
+                               test_values, // global work size
+                               NULL, 0, NULL, NULL);
+    test_error_ret(err, "clEnqueueNDRangeKernel failed", -1);
+    err = clEnqueueReadBuffer(queue, buffer, CL_NON_BLOCKING, 0,
+                              sizeof(kernel_work_size), kernel_work_size, 0,
+                              NULL, NULL);
+    test_error_ret(err, "clEnqueueReadBuffer failed", -1);
+    // The read is non-blocking; wait before inspecting kernel_work_size.
+    err = clFinish(queue);
+    test_error_ret(err, "clFinish failed", -1);
+    if (kernel_work_size[0] != local_work_size[0]
+        || kernel_work_size[1] != local_work_size[1]
+        || kernel_work_size[2] != local_work_size[2])
+    {
+        // Each tuple goes on its own line so the two are easy to compare.
+        std::cout
+            << "Kernel work size differs from local work size suggested:\n"
+            << "Kernel work size: (" << kernel_work_size[0] << ", "
+            << kernel_work_size[1] << ", " << kernel_work_size[2] << ")\n"
+            << "Local work size: (" << local_work_size[0] << ", "
+            << local_work_size[1] << ", " << local_work_size[2] << ")\n";
+        return -1;
+    }
+    return err;
+}
+/*
+ * Runs do_test() over a range of first-dimension global sizes for each of six
+ * kernel configurations: one without local memory, four with a static local
+ * array of increasing size, and one with a dynamically sized local argument.
+ *
+ *   skip_cond            candidate sizes for which this returns true are
+ *                        skipped
+ *   start, end, incr     candidates are start, start + incr, ... below end
+ *   max_local_mem_size   configurations needing more local memory are skipped
+ *   global_work_offset   forwarded to do_test()
+ *   dim                  number of dimensions; for _2D and _3D the remaining
+ *                        global sizes are taken from value_range_nD
+ *
+ * Returns 0 on success and -1 on failure. The -1 set before the candidate
+ * loop survives to the final return only when no candidate passed skip_cond
+ * for the last configuration processed.
+ */
+int do_test_work_group_suggested_local_size(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
+    cl_long max_local_mem_size, size_t global_work_offset[], num_dims dim)
+{
+    clProgramWrapper scan_program;
+    clKernelWrapper scan_kernel;
+    // Initialised so the final return is well defined even if every
+    // configuration is skipped.
+    int err = 0;
+    size_t test_values[] = { 1, 1, 1 };
+    std::string kernel_names[6] = {
+        "test_wg_scan_local_work_group_size",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_dynlocal"
+    };
+    std::string str_local_mem_size[6] = {
+        "-DLOCAL_MEM_SIZE=1",     "-DLOCAL_MEM_SIZE=1024",
+        "-DLOCAL_MEM_SIZE=4096",  "-DLOCAL_MEM_SIZE=16384",
+        "-DLOCAL_MEM_SIZE=32768", "-DLOCAL_MEM_SIZE=1"
+    };
+    size_t local_mem_size[6] = { 1, 1024, 4096, 16384, 32768, 1 };
+    size_t dyn_mem_size[6] = { 0, 0, 0, 0, 0, 1024 };
+    cl_ulong kernel_local_mem_size;
+    for (int kernel_num = 0; kernel_num < 6; kernel_num++)
+    {
+        // Cheap pre-check against the requested static size before building.
+        if (max_local_mem_size < local_mem_size[kernel_num]) continue;
+        // Create the kernel
+        err = create_single_kernel_helper(
+            context, &scan_program, &scan_kernel, 1,
+            &wg_scan_local_work_group_size, (kernel_names[kernel_num]).c_str(),
+            (str_local_mem_size[kernel_num]).c_str());
+        test_error_ret(err,
+                       ("create_single_kernel_helper failed for kernel "
+                        + kernel_names[kernel_num])
+                           .c_str(),
+                       -1);
+        // Check if the local memory used by the kernel is going to exceed the
+        // max_local_mem_size
+        err = clGetKernelWorkGroupInfo(
+            scan_kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
+            sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL);
+        test_error_ret(err, "clGetKernelWorkGroupInfo failed", -1);
+        if (kernel_local_mem_size > max_local_mem_size) continue;
+        // return error if no number is found due to the skip condition
+        err = -1;
+        unsigned int j = 0;
+        // Take the count from the array that is actually indexed below, so
+        // the modulus stays correct if the two tables ever differ in length.
+        size_t num_elems = NELEMS(value_range_nD);
+        for (size_t i = start; i < end; i += incr)
+        {
+            if (skip_cond(i)) continue;
+            err = 0;
+            test_values[0] = i;
+            if (dim == _2D) test_values[1] = value_range_nD[j++ % num_elems];
+            if (dim == _3D)
+            {
+                // Second dimension cycles through the table; the third is
+                // picked with rand().
+                test_values[1] = value_range_nD[j++ % num_elems];
+                test_values[2] = value_range_nD[rand() % num_elems];
+            }
+            err |= do_test(device, context, queue, scan_kernel, dim,
+                           global_work_offset, test_values,
+                           dyn_mem_size[kernel_num]);
+            test_error_ret(
+                err,
+                ("do_test failed for kernel " + kernel_names[kernel_num])
+                    .c_str(),
+                -1);
+        }
+    }
+    return err;
+}
+/*
+ * Shared driver for the 1D, 2D and 3D suggested-local-size tests.
+ *
+ * Skips with TEST_SKIPPED_ITSELF when the device does not report
+ * cl_khr_suggested_local_work_size. Otherwise runs three sweeps (odd sizes,
+ * even sizes, prime sizes) first with a zero global work offset and then with
+ * an offset of 10 in every dimension.
+ *
+ *   dim                  number of ND-range dimensions
+ *   test_name            name used in the log and error messages
+ *   basic_end_divisor    the odd/even sweeps stop below
+ *                        CL_DEVICE_MAX_WORK_GROUP_SIZE / basic_end_divisor
+ *   primes_span_divisor  the prime sweep covers (max, max + max / divisor)
+ *
+ * Returns 0 on success and -1 as soon as any query or sweep fails.
+ */
+static int run_suggested_local_size_sweeps(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue, num_dims dim,
+                                           const char *test_name,
+                                           size_t basic_end_divisor,
+                                           size_t primes_span_divisor)
+{
+    if (!is_extension_available(device, "cl_khr_suggested_local_work_size"))
+    {
+        log_info("Device does not support 'cl_khr_suggested_local_work_size'. "
+                 "Skipping the test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    // CL_DEVICE_LOCAL_MEM_SIZE is a cl_ulong query; it is converted to the
+    // cl_long expected by do_test_work_group_suggested_local_size() below.
+    cl_ulong max_local_mem_size = 0;
+    cl_int err =
+        clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.",
+                   -1);
+    size_t max_work_items = 0;
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                          sizeof(max_work_items), &max_work_items, NULL);
+    test_error_ret(
+        err, "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.", -1);
+
+    const size_t basic_end = max_work_items / basic_end_divisor;
+    const size_t primes_end =
+        max_work_items + max_work_items / primes_span_divisor;
+
+    // One entry per sweep, in the order in which they are run.
+    struct sweep_desc
+    {
+        const char *label;
+        bool (*skip_cond)(size_t);
+        size_t start;
+        size_t end;
+        size_t incr;
+    };
+    const sweep_desc sweeps[] = {
+        { "odds", is_not_odd, 1, basic_end, basic_increment },
+        { "evens", is_not_even, 2, basic_end, basic_increment },
+        { "primes", is_not_prime, max_work_items + 1, primes_end,
+          primes_increment },
+    };
+
+    const size_t offsets[] = { 0, 10 };
+    for (size_t offset : offsets)
+    {
+        size_t global_work_offset[] = { offset, offset, offset };
+        const std::string suffix =
+            (offset != 0) ? " with global_work_offset" : "";
+        for (const sweep_desc &sweep : sweeps)
+        {
+            err = do_test_work_group_suggested_local_size(
+                device, context, queue, sweep.skip_cond, sweep.start,
+                sweep.end, sweep.incr,
+                static_cast<cl_long>(max_local_mem_size), global_work_offset,
+                dim);
+            const std::string failure = std::string(test_name) + " for "
+                + sweep.label + suffix + " failed.";
+            test_error_ret(err, failure.c_str(), -1);
+            log_info("%s %s%s passed\n", test_name, sweep.label,
+                     suffix.c_str());
+        }
+    }
+    return err;
+}
+
+// 1D variant: odd/even sizes below the maximum work-group size, primes in
+// (max, 2 * max). n_elems is not used.
+int test_work_group_suggested_local_size_1D(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue, int n_elems)
+{
+    return run_suggested_local_size_sweeps(
+        device, context, queue, _1D, "test_work_group_suggested_local_size_1D",
+        1, 1);
+}
+
+// 2D variant: odd/even sizes below the maximum work-group size, primes in
+// (max, max + max / 4). n_elems is not used.
+int test_work_group_suggested_local_size_2D(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue, int n_elems)
+{
+    return run_suggested_local_size_sweeps(
+        device, context, queue, _2D, "test_work_group_suggested_local_size_2D",
+        1, 4);
+}
+
+// 3D variant: odd/even sizes below half the maximum work-group size, primes
+// in (max, max + max / 4). n_elems is not used.
+int test_work_group_suggested_local_size_3D(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue, int n_elems)
+{
+    return run_suggested_local_size_sweeps(
+        device, context, queue, _3D, "test_work_group_suggested_local_size_3D",
+        2, 4);
+}