| // |
| // Copyright (c) 2017 The Khronos Group Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| #include "harness/compat.h" |
| #include "harness/kernelHelpers.h" |
| #include "harness/testHarness.h" |
| |
| #include <string.h> |
| |
| #include <algorithm> |
| |
| #include "cl_utils.h" |
| #include "tests.h" |
| |
| #include <CL/cl_half.h> |
| |
| typedef struct ComputeReferenceInfoF_ |
| { |
| float *x; |
| cl_ushort *r; |
| f2h f; |
| cl_ulong i; |
| cl_uint lim; |
| cl_uint count; |
| } ComputeReferenceInfoF; |
| |
| typedef struct ComputeReferenceInfoD_ |
| { |
| double *x; |
| cl_ushort *r; |
| d2h f; |
| cl_ulong i; |
| cl_uint lim; |
| cl_uint count; |
| } ComputeReferenceInfoD; |
| |
| typedef struct CheckResultInfoF_ |
| { |
| const float *x; |
| const cl_ushort *r; |
| const cl_ushort *s; |
| f2h f; |
| const char *aspace; |
| cl_uint lim; |
| cl_uint count; |
| int vsz; |
| } CheckResultInfoF; |
| |
| typedef struct CheckResultInfoD_ |
| { |
| const double *x; |
| const cl_ushort *r; |
| const cl_ushort *s; |
| d2h f; |
| const char *aspace; |
| cl_uint lim; |
| cl_uint count; |
| int vsz; |
| } CheckResultInfoD; |
| |
| static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) |
| { |
| ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; |
| cl_uint lim = cri->lim; |
| cl_uint count = cri->count; |
| cl_uint off = jid * count; |
| float *x = cri->x + off; |
| cl_ushort *r = cri->r + off; |
| f2h f = cri->f; |
| cl_ulong i = cri->i + off; |
| cl_uint j; |
| |
| if (off + count > lim) count = lim - off; |
| |
| for (j = 0; j < count; ++j) |
| { |
| x[j] = as_float((cl_uint)(i + j)); |
| r[j] = f(x[j]); |
| } |
| |
| return 0; |
| } |
| |
| static cl_int CheckF(cl_uint jid, cl_uint tid, void *userInfo) |
| { |
| CheckResultInfoF *cri = (CheckResultInfoF *)userInfo; |
| cl_uint lim = cri->lim; |
| cl_uint count = cri->count; |
| cl_uint off = jid * count; |
| const float *x = cri->x + off; |
| const cl_ushort *r = cri->r + off; |
| const cl_ushort *s = cri->s + off; |
| f2h f = cri->f; |
| cl_uint j; |
| cl_ushort correct2 = f(0.0f); |
| cl_ushort correct3 = f(-0.0f); |
| cl_int ret = 0; |
| |
| if (off + count > lim) count = lim - off; |
| |
| if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0; |
| |
| for (j = 0; j < count; j++) |
| { |
| if (s[j] == r[j]) continue; |
| |
| // Pass any NaNs |
| if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue; |
| |
| // retry per section 6.5.3.3 |
| if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3)) |
| continue; |
| |
| // if reference result is subnormal, pass any zero |
| if (gIsEmbedded && IsHalfSubnormal(r[j]) |
| && (s[j] == 0x0000 || s[j] == 0x8000)) |
| continue; |
| |
| vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, " |
| "vector_size = %d, address_space = %s\n", |
| j + off, x[j], r[j], s[j], cri->vsz, cri->aspace); |
| |
| ret = 1; |
| break; |
| } |
| |
| return ret; |
| } |
| |
| static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) |
| { |
| ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo; |
| cl_uint lim = cri->lim; |
| cl_uint count = cri->count; |
| cl_uint off = jid * count; |
| double *x = cri->x + off; |
| cl_ushort *r = cri->r + off; |
| d2h f = cri->f; |
| cl_uint j; |
| cl_ulong i = cri->i + off; |
| |
| if (off + count > lim) count = lim - off; |
| |
| for (j = 0; j < count; ++j) |
| { |
| x[j] = as_double(DoubleFromUInt((cl_uint)(i + j))); |
| r[j] = f(x[j]); |
| } |
| |
| return 0; |
| } |
| |
| static cl_int CheckD(cl_uint jid, cl_uint tid, void *userInfo) |
| { |
| CheckResultInfoD *cri = (CheckResultInfoD *)userInfo; |
| cl_uint lim = cri->lim; |
| cl_uint count = cri->count; |
| cl_uint off = jid * count; |
| const double *x = cri->x + off; |
| const cl_ushort *r = cri->r + off; |
| const cl_ushort *s = cri->s + off; |
| d2h f = cri->f; |
| cl_uint j; |
| cl_ushort correct2 = f(0.0); |
| cl_ushort correct3 = f(-0.0); |
| cl_int ret = 0; |
| |
| if (off + count > lim) count = lim - off; |
| |
| if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0; |
| |
| for (j = 0; j < count; j++) |
| { |
| if (s[j] == r[j]) continue; |
| |
| // Pass any NaNs |
| if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue; |
| |
| if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3)) |
| continue; |
| |
| // if reference result is subnormal, pass any zero result |
| if (gIsEmbedded && IsHalfSubnormal(r[j]) |
| && (s[j] == 0x0000 || s[j] == 0x8000)) |
| continue; |
| |
| vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, " |
| "vector_size = %d, address space = %s (double precision)\n", |
| j + off, x[j], r[j], s[j], cri->vsz, cri->aspace); |
| |
| ret = 1; |
| break; |
| } |
| |
| return ret; |
| } |
| |
| static cl_half float2half_rte(float f) |
| { |
| return cl_half_from_float(f, CL_HALF_RTE); |
| } |
| |
| static cl_half float2half_rtz(float f) |
| { |
| return cl_half_from_float(f, CL_HALF_RTZ); |
| } |
| |
| static cl_half float2half_rtp(float f) |
| { |
| return cl_half_from_float(f, CL_HALF_RTP); |
| } |
| |
| static cl_half float2half_rtn(float f) |
| { |
| return cl_half_from_float(f, CL_HALF_RTN); |
| } |
| |
| static cl_half double2half_rte(double f) |
| { |
| return cl_half_from_double(f, CL_HALF_RTE); |
| } |
| |
| static cl_half double2half_rtz(double f) |
| { |
| return cl_half_from_double(f, CL_HALF_RTZ); |
| } |
| |
| static cl_half double2half_rtp(double f) |
| { |
| return cl_half_from_double(f, CL_HALF_RTP); |
| } |
| |
| static cl_half double2half_rtn(double f) |
| { |
| return cl_half_from_double(f, CL_HALF_RTN); |
| } |
| |
| REGISTER_TEST(vstore_half) |
| { |
| switch (get_default_rounding_mode(device)) |
| { |
| case CL_FP_ROUND_TO_ZERO: |
| return Test_vStoreHalf_private(device, float2half_rtz, |
| double2half_rte, ""); |
| case 0: return -1; |
| default: |
| return Test_vStoreHalf_private(device, float2half_rte, |
| double2half_rte, ""); |
| } |
| } |
| |
| REGISTER_TEST(vstore_half_rte) |
| { |
| return Test_vStoreHalf_private(device, float2half_rte, double2half_rte, |
| "_rte"); |
| } |
| |
| REGISTER_TEST(vstore_half_rtz) |
| { |
| return Test_vStoreHalf_private(device, float2half_rtz, double2half_rtz, |
| "_rtz"); |
| } |
| |
| REGISTER_TEST(vstore_half_rtp) |
| { |
| return Test_vStoreHalf_private(device, float2half_rtp, double2half_rtp, |
| "_rtp"); |
| } |
| |
| REGISTER_TEST(vstore_half_rtn) |
| { |
| return Test_vStoreHalf_private(device, float2half_rtn, double2half_rtn, |
| "_rtn"); |
| } |
| |
| REGISTER_TEST(vstorea_half) |
| { |
| switch (get_default_rounding_mode(device)) |
| { |
| case CL_FP_ROUND_TO_ZERO: |
| return Test_vStoreaHalf_private(device, float2half_rtz, |
| double2half_rte, ""); |
| case 0: return -1; |
| default: |
| return Test_vStoreaHalf_private(device, float2half_rte, |
| double2half_rte, ""); |
| } |
| } |
| |
| REGISTER_TEST(vstorea_half_rte) |
| { |
| return Test_vStoreaHalf_private(device, float2half_rte, double2half_rte, |
| "_rte"); |
| } |
| |
| REGISTER_TEST(vstorea_half_rtz) |
| { |
| return Test_vStoreaHalf_private(device, float2half_rtz, double2half_rtz, |
| "_rtz"); |
| } |
| |
| REGISTER_TEST(vstorea_half_rtp) |
| { |
| return Test_vStoreaHalf_private(device, float2half_rtp, double2half_rtp, |
| "_rtp"); |
| } |
| |
| REGISTER_TEST(vstorea_half_rtn) |
| { |
| return Test_vStoreaHalf_private(device, float2half_rtn, double2half_rtn, |
| "_rtn"); |
| } |
| |
| #pragma mark - |
| |
| int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc, |
| d2h doubleReferenceFunc, const char *roundName) |
| { |
| int vectorSize, error; |
| cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| cl_program resetProgram = nullptr; |
| cl_kernel resetKernel = nullptr; |
| |
| uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; |
| uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; |
| memset(min_time, -1, sizeof(min_time)); |
| cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; |
| uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = { |
| 0 |
| }; |
| memset(min_double_time, -1, sizeof(min_double_time)); |
| |
| bool aligned = false; |
| |
| for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| const char *source[] = { "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " vstore_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], i, f );\n" |
| "}\n" }; |
| |
| const char *source_v3[] = { |
| "__kernel void test( __global float *p, __global half *f,\n" |
| " uint extra_last_thread)\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t last_i = get_global_size(0)-1;\n" |
| " size_t adjust = 0;\n" |
| " if(last_i == i && extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " } " |
| " vstore_half3", |
| roundName, |
| "( vload3(i, p-adjust), i, f-adjust );\n" |
| "}\n" |
| }; |
| |
| const char *source_private_store[] = { |
| "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __private ushort data[16];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t offset = 0;\n" |
| " size_t vecsize = vec_step(p[i]);\n" |
| " vstore_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], 0, (__private half *)(&data[0]) );\n" |
| " for(offset = 0; offset < vecsize; offset++)\n" |
| " {\n" |
| " vstore_half(vload_half(offset, (__private half *)data), 0, " |
| "&f[vecsize*i+offset]);\n" |
| " }\n" |
| "}\n" |
| }; |
| |
| |
| const char *source_private_store_v3[] = { |
| "__kernel void test( __global float *p, __global half *f,\n" |
| " uint extra_last_thread )\n" |
| "{\n" |
| " __private ushort data[4];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t last_i = get_global_size(0)-1;\n" |
| " size_t adjust = 0;\n" |
| " size_t offset = 0;\n" |
| " if(last_i == i && extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " } " |
| " vstore_half3", |
| roundName, |
| "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n" |
| " for(offset = 0; offset < 3; offset++)\n" |
| " {\n" |
| " vstore_half(vload_half(offset, (__private half *) data), " |
| "0, &f[3*i+offset-adjust]);\n" |
| " }\n" |
| "}\n" |
| }; |
| |
| char local_buf_size[10]; |
| sprintf(local_buf_size, "%zu", gWorkGroupSize); |
| |
| |
| const char *source_local_store[] = { |
| "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __local ushort data[16*", |
| local_buf_size, |
| "];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " size_t lsize = get_local_size(0);\n" |
| " size_t vecsize = vec_step(p[0]);\n" |
| " event_t async_event;\n" |
| " vstore_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], lid, (__local half *)(&data[0]) );\n" |
| " barrier( CLK_LOCAL_MEM_FENCE ); \n" |
| " async_event = async_work_group_copy((__global ushort " |
| "*)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, " |
| "0);\n" // investigate later |
| " wait_group_events(1, &async_event);\n" |
| "}\n" |
| }; |
| |
| const char *source_local_store_v3[] = { |
| "__kernel void test( __global float *p, __global half *f,\n" |
| " uint extra_last_thread )\n" |
| "{\n" |
| " __local ushort data[3*(", |
| local_buf_size, |
| "+1)];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " size_t last_i = get_global_size(0)-1;\n" |
| " size_t adjust = 0;\n" |
| " size_t lsize = get_local_size(0);\n" |
| " event_t async_event;\n" |
| " if(last_i == i && extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " } " |
| " vstore_half3", |
| roundName, |
| "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n" |
| " barrier( CLK_LOCAL_MEM_FENCE ); \n" |
| " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n" |
| " extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " }\n" |
| " async_event = async_work_group_copy(\n" |
| " (__global ushort*)(f+3*(i-lid)),\n" |
| " (__local ushort *)(&data[adjust]),\n" |
| " lsize*3-adjust, 0);\n" // investigate later |
| " wait_group_events(1, &async_event);\n" |
| "}\n" |
| }; |
| |
| const char *double_source[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " vstore_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], i, f );\n" |
| "}\n" |
| }; |
| |
| const char *double_source_private_store[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __private ushort data[16];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t offset = 0;\n" |
| " size_t vecsize = vec_step(p[i]);\n" |
| " vstore_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], 0, (__private half *)(&data[0]) );\n" |
| " for(offset = 0; offset < vecsize; offset++)\n" |
| " {\n" |
| " vstore_half(vload_half(offset, (__private half *)data), 0, " |
| "&f[vecsize*i+offset]);\n" |
| " }\n" |
| "}\n" |
| }; |
| |
| |
| const char *double_source_local_store[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __local ushort data[16*", |
| local_buf_size, |
| "];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " size_t vecsize = vec_step(p[0]);\n" |
| " size_t lsize = get_local_size(0);\n" |
| " event_t async_event;\n" |
| " vstore_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], lid, (__local half *)(&data[0]) );\n" |
| " barrier( CLK_LOCAL_MEM_FENCE ); \n" |
| " async_event = async_work_group_copy((__global ushort " |
| "*)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), " |
| "vecsize*lsize, 0);\n" // investigate later |
| " wait_group_events(1, &async_event);\n" |
| "}\n" |
| }; |
| |
| |
| const char *double_source_v3[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double *p, __global half *f ,\n" |
| " uint extra_last_thread)\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t last_i = get_global_size(0)-1;\n" |
| " size_t adjust = 0;\n" |
| " if(last_i == i && extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " } " |
| " vstore_half3", |
| roundName, |
| "( vload3(i,p-adjust), i, f -adjust);\n" |
| "}\n" |
| }; |
| |
| const char *double_source_private_store_v3[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double *p, __global half *f,\n" |
| " uint extra_last_thread )\n" |
| "{\n" |
| " __private ushort data[4];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t last_i = get_global_size(0)-1;\n" |
| " size_t adjust = 0;\n" |
| " size_t offset = 0;\n" |
| " if(last_i == i && extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " } " |
| " vstore_half3", |
| roundName, |
| "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n" |
| " for(offset = 0; offset < 3; offset++)\n" |
| " {\n" |
| " vstore_half(vload_half(offset, (__private half *)data), 0, " |
| "&f[3*i+offset-adjust]);\n" |
| " }\n" |
| "}\n" |
| }; |
| |
| const char *double_source_local_store_v3[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double *p, __global half *f,\n" |
| " uint extra_last_thread )\n" |
| "{\n" |
| " __local ushort data[3*(", |
| local_buf_size, |
| "+1)];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " size_t last_i = get_global_size(0)-1;\n" |
| " size_t adjust = 0;\n" |
| " size_t lsize = get_local_size(0);\n" |
| " event_t async_event;\n" |
| " if(last_i == i && extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " }\n " |
| " vstore_half3", |
| roundName, |
| "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n" |
| " barrier( CLK_LOCAL_MEM_FENCE ); \n" |
| " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n" |
| " extra_last_thread != 0) {\n" |
| " adjust = 3-extra_last_thread;\n" |
| " }\n" |
| " async_event = async_work_group_copy(\n" |
| " (__global ushort *)(f+3*(i-lid)),\n" |
| " (__local ushort *)(&data[adjust]),\n" |
| " lsize*3-adjust, 0);\n" // investigate later |
| " wait_group_events(1, &async_event);\n" |
| "}\n" |
| }; |
| |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| programs[vectorSize][0] = MakeProgram( |
| device, source_v3, sizeof(source_v3) / sizeof(source_v3[0])); |
| } |
| else |
| { |
| programs[vectorSize][0] = |
| MakeProgram(device, source, sizeof(source) / sizeof(source[0])); |
| } |
| if (NULL == programs[vectorSize][0]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| |
| kernels[vectorSize][0] = |
| clCreateKernel(programs[vectorSize][0], "test", &error); |
| if (NULL == kernels[vectorSize][0]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| programs[vectorSize][1] = |
| MakeProgram(device, source_private_store_v3, |
| sizeof(source_private_store_v3) |
| / sizeof(source_private_store_v3[0])); |
| } |
| else |
| { |
| programs[vectorSize][1] = MakeProgram( |
| device, source_private_store, |
| sizeof(source_private_store) / sizeof(source_private_store[0])); |
| } |
| if (NULL == programs[vectorSize][1]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| |
| kernels[vectorSize][1] = |
| clCreateKernel(programs[vectorSize][1], "test", &error); |
| if (NULL == kernels[vectorSize][1]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| programs[vectorSize][2] = |
| MakeProgram(device, source_local_store_v3, |
| sizeof(source_local_store_v3) |
| / sizeof(source_local_store_v3[0])); |
| if (NULL == programs[vectorSize][2]) |
| { |
| unsigned q; |
| for (q = 0; q < sizeof(source_local_store_v3) |
| / sizeof(source_local_store_v3[0]); |
| q++) |
| vlog_error("%s", source_local_store_v3[q]); |
| |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| programs[vectorSize][2] = MakeProgram( |
| device, source_local_store, |
| sizeof(source_local_store) / sizeof(source_local_store[0])); |
| if (NULL == programs[vectorSize][2]) |
| { |
| unsigned q; |
| for (q = 0; q < sizeof(source_local_store) |
| / sizeof(source_local_store[0]); |
| q++) |
| vlog_error("%s", source_local_store[q]); |
| |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| kernels[vectorSize][2] = |
| clCreateKernel(programs[vectorSize][2], "test", &error); |
| if (NULL == kernels[vectorSize][2]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (gTestDouble) |
| { |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| doublePrograms[vectorSize][0] = MakeProgram( |
| device, double_source_v3, |
| sizeof(double_source_v3) / sizeof(double_source_v3[0])); |
| } |
| else |
| { |
| doublePrograms[vectorSize][0] = MakeProgram( |
| device, double_source, |
| sizeof(double_source) / sizeof(double_source[0])); |
| } |
| if (NULL == doublePrograms[vectorSize][0]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| |
| doubleKernels[vectorSize][0] = |
| clCreateKernel(doublePrograms[vectorSize][0], "test", &error); |
| if (NULL == kernels[vectorSize][0]) |
| { |
| gFailCount++; |
| vlog_error( |
| "\t\tFAILED -- Failed to create double kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| doublePrograms[vectorSize][1] = MakeProgram( |
| device, double_source_private_store_v3, |
| sizeof(double_source_private_store_v3) |
| / sizeof(double_source_private_store_v3[0])); |
| else |
| doublePrograms[vectorSize][1] = |
| MakeProgram(device, double_source_private_store, |
| sizeof(double_source_private_store) |
| / sizeof(double_source_private_store[0])); |
| |
| if (NULL == doublePrograms[vectorSize][1]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| |
| doubleKernels[vectorSize][1] = |
| clCreateKernel(doublePrograms[vectorSize][1], "test", &error); |
| if (NULL == kernels[vectorSize][1]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create double private " |
| "kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| doublePrograms[vectorSize][2] = |
| MakeProgram(device, double_source_local_store_v3, |
| sizeof(double_source_local_store_v3) |
| / sizeof(double_source_local_store_v3[0])); |
| } |
| else |
| { |
| doublePrograms[vectorSize][2] = |
| MakeProgram(device, double_source_local_store, |
| sizeof(double_source_local_store) |
| / sizeof(double_source_local_store[0])); |
| } |
| if (NULL == doublePrograms[vectorSize][2]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| |
| doubleKernels[vectorSize][2] = |
| clCreateKernel(doublePrograms[vectorSize][2], "test", &error); |
| if (NULL == kernels[vectorSize][2]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create double local " |
| "kernel. (%d)\n", |
| error); |
| return error; |
| } |
| } |
| } // end for vector size |
| |
| const char *reset[] = { |
| "__kernel void reset( __global float *p, __global ushort *f,\n" |
| " uint extra_last_thread)\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " *(f + i) = 0xdead;" |
| "}\n" |
| }; |
| |
| if (!gHostReset) |
| { |
| resetProgram = |
| MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0])); |
| if (NULL == resetProgram) |
| { |
| gFailCount++; |
| return -1; |
| } |
| resetKernel = clCreateKernel(resetProgram, "reset", &error); |
| if (NULL == resetKernel) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| // Figure out how many elements are in a work block |
| size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float)); |
| size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2 |
| uint64_t lastCase = 1ULL << (8 * sizeof(float)); // number of floats. |
| size_t stride = blockCount; |
| |
| if (gWimpyMode) |
| stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor; |
| |
| // we handle 64-bit types a bit differently. |
| if (lastCase == 0) lastCase = 0x100000000ULL; |
| |
| uint64_t i, j; |
| error = 0; |
| uint64_t printMask = (lastCase >> 4) - 1; |
| cl_uint count = 0; |
| int addressSpace; |
| size_t loopCount; |
| cl_uint threadCount = GetThreadCount(); |
| |
| ComputeReferenceInfoF fref; |
| fref.x = (float *)gIn_single; |
| fref.r = (cl_half *)gOut_half_reference; |
| fref.f = referenceFunc; |
| fref.lim = blockCount; |
| fref.count = (blockCount + threadCount - 1) / threadCount; |
| |
| CheckResultInfoF fchk; |
| fchk.x = (const float *)gIn_single; |
| fchk.r = (const cl_half *)gOut_half_reference; |
| fchk.s = (const cl_half *)gOut_half; |
| fchk.f = referenceFunc; |
| fchk.lim = blockCount; |
| fchk.count = (blockCount + threadCount - 1) / threadCount; |
| |
| ComputeReferenceInfoD dref; |
| dref.x = (double *)gIn_double; |
| dref.r = (cl_half *)gOut_half_reference_double; |
| dref.f = doubleReferenceFunc; |
| dref.lim = blockCount; |
| dref.count = (blockCount + threadCount - 1) / threadCount; |
| |
| CheckResultInfoD dchk; |
| dchk.x = (const double *)gIn_double; |
| dchk.r = (const cl_half *)gOut_half_reference_double; |
| dchk.s = (const cl_half *)gOut_half; |
| dchk.f = doubleReferenceFunc; |
| dchk.lim = blockCount; |
| dchk.count = (blockCount + threadCount - 1) / threadCount; |
| |
| for (i = 0; i < lastCase; i += stride) |
| { |
| count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i); |
| fref.i = i; |
| dref.i = i; |
| |
| // Compute the input and reference |
| ThreadPool_Do(ReferenceF, threadCount, &fref); |
| |
| error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, |
| count * sizeof(float), gIn_single, 0, NULL, |
| NULL); |
| if (error) |
| { |
| vlog_error("Failure in clWriteBuffer\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| if (gTestDouble) |
| { |
| ThreadPool_Do(ReferenceD, threadCount, &dref); |
| |
| error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, |
| count * sizeof(double), gIn_double, 0, |
| NULL, NULL); |
| if (error) |
| { |
| vlog_error("Failure in clWriteBuffer\n"); |
| gFailCount++; |
| goto exit; |
| } |
| } |
| |
| for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| // Loop through vector sizes |
| fchk.vsz = g_arrVecSizes[vectorSize]; |
| dchk.vsz = g_arrVecSizes[vectorSize]; |
| |
| for (addressSpace = 0; addressSpace < 3; addressSpace++) |
| { |
| // Loop over address spaces |
| fchk.aspace = addressSpaceNames[addressSpace]; |
| dchk.aspace = addressSpaceNames[addressSpace]; |
| |
| if (!gHostReset) |
| { |
| error = RunKernel(device, resetKernel, gInBuffer_single, |
| gOutBuffer_half, count, 0); |
| } |
| else |
| { |
| cl_uint pattern = 0xdeaddead; |
| memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); |
| |
| error = clEnqueueWriteBuffer( |
| gQueue, gOutBuffer_half, CL_FALSE, 0, |
| count * sizeof(cl_half), gOut_half, 0, NULL, NULL); |
| } |
| if (error) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = RunKernel(device, kernels[vectorSize][addressSpace], |
| gInBuffer_single, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, |
| count * sizeof(cl_half), gOut_half, |
| 0, NULL, NULL); |
| if (error) |
| { |
| vlog_error("Failure in clReadArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = ThreadPool_Do(CheckF, threadCount, &fchk); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| if (gTestDouble) |
| { |
| |
| if (!gHostReset) |
| { |
| error = RunKernel(device, resetKernel, gInBuffer_double, |
| gOutBuffer_half, count, 0); |
| } |
| else |
| { |
| cl_uint pattern = 0xdeaddead; |
| memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); |
| |
| error = clEnqueueWriteBuffer( |
| gQueue, gOutBuffer_half, CL_FALSE, 0, |
| count * sizeof(cl_half), gOut_half, 0, NULL, NULL); |
| } |
| if (error) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = RunKernel(device, |
| doubleKernels[vectorSize][addressSpace], |
| gInBuffer_double, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = clEnqueueReadBuffer( |
| gQueue, gOutBuffer_half, CL_TRUE, 0, |
| count * sizeof(cl_half), gOut_half, 0, NULL, NULL); |
| if (error) |
| { |
| vlog_error("Failure in clReadArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| |
| error = ThreadPool_Do(CheckD, threadCount, &dchk); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| } |
| } |
| } |
| |
| if (((i + blockCount) & ~printMask) == (i + blockCount)) |
| { |
| vlog("."); |
| fflush(stdout); |
| } |
| } // end last case |
| |
| loopCount = count == blockCount ? 1 : 100; |
| if (gReportTimes) |
| { |
| // Init the input stream |
| cl_float *p = (cl_float *)gIn_single; |
| for (j = 0; j < count; j++) |
| p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); |
| |
| if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, |
| count * sizeof(float), gIn_single, 0, |
| NULL, NULL))) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| if (gTestDouble) |
| { |
| // Init the input stream |
| cl_double *q = (cl_double *)gIn_double; |
| for (j = 0; j < count; j++) |
| q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); |
| |
| if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, |
| 0, count * sizeof(double), |
| gIn_double, 0, NULL, NULL))) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| } |
| |
| // Run again for timing |
| for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| uint64_t bestTime = -1ULL; |
| for (j = 0; j < loopCount; j++) |
| { |
| uint64_t startTime = ReadTime(); |
| |
| |
| if ((error = RunKernel(device, kernels[vectorSize][0], |
| gInBuffer_single, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)))) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| if ((error = clFinish(gQueue))) |
| { |
| vlog_error("Failure in clFinish\n"); |
| gFailCount++; |
| goto exit; |
| } |
| uint64_t currentTime = ReadTime() - startTime; |
| if (currentTime < bestTime) bestTime = currentTime; |
| time[vectorSize] += currentTime; |
| } |
| if (bestTime < min_time[vectorSize]) |
| min_time[vectorSize] = bestTime; |
| |
| if (gTestDouble) |
| { |
| bestTime = -1ULL; |
| for (j = 0; j < loopCount; j++) |
| { |
| uint64_t startTime = ReadTime(); |
| if ((error = |
| RunKernel(device, doubleKernels[vectorSize][0], |
| gInBuffer_double, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)))) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| if ((error = clFinish(gQueue))) |
| { |
| vlog_error("Failure in clFinish\n"); |
| gFailCount++; |
| goto exit; |
| } |
| uint64_t currentTime = ReadTime() - startTime; |
| if (currentTime < bestTime) bestTime = currentTime; |
| doubleTime[vectorSize] += currentTime; |
| } |
| if (bestTime < min_double_time[vectorSize]) |
| min_double_time[vectorSize] = bestTime; |
| } |
| } |
| } |
| |
| if (gReportTimes) |
| { |
| for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency |
| * gComputeDevices / (double)(count * loopCount), |
| 0, "average us/elem", |
| "vStoreHalf%s avg. (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6 |
| * gDeviceFrequency * gComputeDevices / (double)count, |
| 0, "best us/elem", |
| "vStoreHalf%s best (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| if (gTestDouble) |
| { |
| for (vectorSize = kMinVectorSize; |
| vectorSize < kLastVectorSizeToTest; vectorSize++) |
| vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6 |
| * gDeviceFrequency * gComputeDevices |
| / (double)(count * loopCount), |
| 0, "average us/elem (double)", |
| "vStoreHalf%s avg. d (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| for (vectorSize = kMinVectorSize; |
| vectorSize < kLastVectorSizeToTest; vectorSize++) |
| vlog_perf(SubtractTime(min_double_time[vectorSize], 0) * 1e6 |
| * gDeviceFrequency * gComputeDevices |
| / (double)count, |
| 0, "best us/elem (double)", |
| "vStoreHalf%s best d (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| } |
| } |
| |
| exit: |
| // clean up |
| if (!gHostReset) |
| { |
| clReleaseKernel(resetKernel); |
| clReleaseProgram(resetProgram); |
| } |
| |
| for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| for (addressSpace = 0; addressSpace < 3; addressSpace++) |
| { |
| clReleaseKernel(kernels[vectorSize][addressSpace]); |
| clReleaseProgram(programs[vectorSize][addressSpace]); |
| if (gTestDouble) |
| { |
| clReleaseKernel(doubleKernels[vectorSize][addressSpace]); |
| clReleaseProgram(doublePrograms[vectorSize][addressSpace]); |
| } |
| } |
| } |
| |
| return error; |
| } |
| |
| int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc, |
| d2h doubleReferenceFunc, const char *roundName) |
| { |
| int vectorSize, error; |
| cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| cl_program resetProgram = nullptr; |
| cl_kernel resetKernel = nullptr; |
| |
| uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; |
| uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; |
| memset(min_time, -1, sizeof(min_time)); |
| cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; |
| uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; |
| uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = { |
| 0 |
| }; |
| memset(min_double_time, -1, sizeof(min_double_time)); |
| |
| bool aligned = true; |
| |
| int minVectorSize = kMinVectorSize; |
| // There is no aligned scalar vstorea_half |
| if (0 == minVectorSize) minVectorSize = 1; |
| |
| // Loop over vector sizes |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| const char *source[] = { "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], i, f );\n" |
| "}\n" }; |
| |
| const char *source_v3[] = { |
| "__kernel void test( __global float3 *p, __global half *f )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " vstorea_half3", |
| roundName, |
| "( p[i], i, f );\n" |
| " vstore_half", |
| roundName, |
| "( ((__global float *)p)[4*i+3], 4*i+3, f);\n" |
| "}\n" |
| }; |
| |
| const char *source_private[] = { |
| "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __private float", |
| vector_size_name_extensions[vectorSize], |
| " data;\n" |
| " size_t i = get_global_id(0);\n" |
| " data = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data, i, f );\n" |
| "}\n" |
| }; |
| |
| const char *source_private_v3[] = { |
| "__kernel void test( __global float3 *p, __global half *f )\n" |
| "{\n" |
| " __private float", |
| vector_size_name_extensions[vectorSize], |
| " data;\n" |
| " size_t i = get_global_id(0);\n" |
| " data = p[i];\n" |
| " vstorea_half3", |
| roundName, |
| "( data, i, f );\n" |
| " vstore_half", |
| roundName, |
| "( ((__global float *)p)[4*i+3], 4*i+3, f);\n" |
| "}\n" |
| }; |
| |
| char local_buf_size[10]; |
| sprintf(local_buf_size, "%zu", gWorkGroupSize); |
| const char *source_local[] = { "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __local float", |
| vector_size_name_extensions[vectorSize], |
| " data[", |
| local_buf_size, |
| "];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " data[lid] = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data[lid], i, f );\n" |
| "}\n" }; |
| |
| const char *source_local_v3[] = { |
| "__kernel void test( __global float", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __local float", |
| vector_size_name_extensions[vectorSize], |
| " data[", |
| local_buf_size, |
| "];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " data[lid] = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data[lid], i, f );\n" |
| " vstore_half", |
| roundName, |
| "( ((__global float *)p)[4*i+3], 4*i+3, f);\n" |
| "}\n" |
| }; |
| |
| const char *double_source[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], i, f );\n" |
| "}\n" |
| }; |
| |
| const char *double_source_v3[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( p[i], i, f );\n" |
| " vstore_half", |
| roundName, |
| "( ((__global double *)p)[4*i+3], 4*i+3, f);\n" |
| "}\n" |
| }; |
| |
| const char *double_source_private[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __private double", |
| vector_size_name_extensions[vectorSize], |
| " data;\n" |
| " size_t i = get_global_id(0);\n" |
| " data = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data, i, f );\n" |
| "}\n" |
| }; |
| |
| const char *double_source_private_v3[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __private double", |
| vector_size_name_extensions[vectorSize], |
| " data;\n" |
| " size_t i = get_global_id(0);\n" |
| " data = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data, i, f );\n" |
| " vstore_half", |
| roundName, |
| "( ((__global double *)p)[4*i+3], 4*i+3, f);\n" |
| "}\n" |
| }; |
| |
| const char *double_source_local[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __local double", |
| vector_size_name_extensions[vectorSize], |
| " data[", |
| local_buf_size, |
| "];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " data[lid] = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data[lid], i, f );\n" |
| "}\n" |
| }; |
| |
| const char *double_source_local_v3[] = { |
| "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" |
| "__kernel void test( __global double", |
| vector_size_name_extensions[vectorSize], |
| " *p, __global half *f )\n" |
| "{\n" |
| " __local double", |
| vector_size_name_extensions[vectorSize], |
| " data[", |
| local_buf_size, |
| "];\n" |
| " size_t i = get_global_id(0);\n" |
| " size_t lid = get_local_id(0);\n" |
| " data[lid] = p[i];\n" |
| " vstorea_half", |
| vector_size_name_extensions[vectorSize], |
| roundName, |
| "( data[lid], i, f );\n" |
| " vstore_half", |
| roundName, |
| "( ((__global double *)p)[4*i+3], 4*i+3, f);\n" |
| "}\n" |
| }; |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| programs[vectorSize][0] = MakeProgram( |
| device, source_v3, sizeof(source_v3) / sizeof(source_v3[0])); |
| if (NULL == programs[vectorSize][0]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| programs[vectorSize][0] = |
| MakeProgram(device, source, sizeof(source) / sizeof(source[0])); |
| if (NULL == programs[vectorSize][0]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| kernels[vectorSize][0] = |
| clCreateKernel(programs[vectorSize][0], "test", &error); |
| if (NULL == kernels[vectorSize][0]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| programs[vectorSize][1] = MakeProgram( |
| device, source_private_v3, |
| sizeof(source_private_v3) / sizeof(source_private_v3[0])); |
| if (NULL == programs[vectorSize][1]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| programs[vectorSize][1] = |
| MakeProgram(device, source_private, |
| sizeof(source_private) / sizeof(source_private[0])); |
| if (NULL == programs[vectorSize][1]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| kernels[vectorSize][1] = |
| clCreateKernel(programs[vectorSize][1], "test", &error); |
| if (NULL == kernels[vectorSize][1]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| programs[vectorSize][2] = MakeProgram( |
| device, source_local_v3, |
| sizeof(source_local_v3) / sizeof(source_local_v3[0])); |
| if (NULL == programs[vectorSize][2]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| programs[vectorSize][2] = |
| MakeProgram(device, source_local, |
| sizeof(source_local) / sizeof(source_local[0])); |
| if (NULL == programs[vectorSize][2]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| kernels[vectorSize][2] = |
| clCreateKernel(programs[vectorSize][2], "test", &error); |
| if (NULL == kernels[vectorSize][2]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (gTestDouble) |
| { |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| doublePrograms[vectorSize][0] = MakeProgram( |
| device, double_source_v3, |
| sizeof(double_source_v3) / sizeof(double_source_v3[0])); |
| if (NULL == doublePrograms[vectorSize][0]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| doublePrograms[vectorSize][0] = MakeProgram( |
| device, double_source, |
| sizeof(double_source) / sizeof(double_source[0])); |
| if (NULL == doublePrograms[vectorSize][0]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| doubleKernels[vectorSize][0] = |
| clCreateKernel(doublePrograms[vectorSize][0], "test", &error); |
| if (NULL == kernels[vectorSize][0]) |
| { |
| gFailCount++; |
| vlog_error( |
| "\t\tFAILED -- Failed to create double kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| doublePrograms[vectorSize][1] = |
| MakeProgram(device, double_source_private_v3, |
| sizeof(double_source_private_v3) |
| / sizeof(double_source_private_v3[0])); |
| if (NULL == doublePrograms[vectorSize][1]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| doublePrograms[vectorSize][1] = |
| MakeProgram(device, double_source_private, |
| sizeof(double_source_private) |
| / sizeof(double_source_private[0])); |
| if (NULL == doublePrograms[vectorSize][1]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| doubleKernels[vectorSize][1] = |
| clCreateKernel(doublePrograms[vectorSize][1], "test", &error); |
| if (NULL == kernels[vectorSize][1]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create double private " |
| "kernel. (%d)\n", |
| error); |
| return error; |
| } |
| |
| if (g_arrVecSizes[vectorSize] == 3) |
| { |
| doublePrograms[vectorSize][2] = |
| MakeProgram(device, double_source_local_v3, |
| sizeof(double_source_local_v3) |
| / sizeof(double_source_local_v3[0])); |
| if (NULL == doublePrograms[vectorSize][2]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| else |
| { |
| doublePrograms[vectorSize][2] = |
| MakeProgram(device, double_source_local, |
| sizeof(double_source_local) |
| / sizeof(double_source_local[0])); |
| if (NULL == doublePrograms[vectorSize][2]) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| doubleKernels[vectorSize][2] = |
| clCreateKernel(doublePrograms[vectorSize][2], "test", &error); |
| if (NULL == kernels[vectorSize][2]) |
| { |
| gFailCount++; |
| vlog_error("\t\tFAILED -- Failed to create double local " |
| "kernel. (%d)\n", |
| error); |
| return error; |
| } |
| } |
| } |
| |
| const char *reset[] = { |
| "__kernel void reset( __global float *p, __global ushort *f,\n" |
| " uint extra_last_thread)\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " *(f + i) = 0xdead;" |
| "}\n" |
| }; |
| |
| if (!gHostReset) |
| { |
| resetProgram = |
| MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0])); |
| if (NULL == resetProgram) |
| { |
| gFailCount++; |
| return -1; |
| } |
| resetKernel = clCreateKernel(resetProgram, "reset", &error); |
| if (NULL == resetKernel) |
| { |
| gFailCount++; |
| return -1; |
| } |
| } |
| |
| // Figure out how many elements are in a work block |
| size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float)); |
| size_t blockCount = BUFFER_SIZE / elementSize; |
| uint64_t lastCase = 1ULL << (8 * sizeof(float)); |
| size_t stride = blockCount; |
| |
| if (gWimpyMode) |
| stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor; |
| |
| // we handle 64-bit types a bit differently. |
| if (lastCase == 0) lastCase = 0x100000000ULL; |
| uint64_t i, j; |
| error = 0; |
| uint64_t printMask = (lastCase >> 4) - 1; |
| cl_uint count = 0; |
| int addressSpace; |
| size_t loopCount; |
| cl_uint threadCount = GetThreadCount(); |
| |
| ComputeReferenceInfoF fref; |
| fref.x = (float *)gIn_single; |
| fref.r = (cl_half *)gOut_half_reference; |
| fref.f = referenceFunc; |
| fref.lim = blockCount; |
| fref.count = (blockCount + threadCount - 1) / threadCount; |
| |
| CheckResultInfoF fchk; |
| fchk.x = (const float *)gIn_single; |
| fchk.r = (const cl_half *)gOut_half_reference; |
| fchk.s = (const cl_half *)gOut_half; |
| fchk.f = referenceFunc; |
| fchk.lim = blockCount; |
| fchk.count = (blockCount + threadCount - 1) / threadCount; |
| |
| ComputeReferenceInfoD dref; |
| dref.x = (double *)gIn_double; |
| dref.r = (cl_half *)gOut_half_reference_double; |
| dref.f = doubleReferenceFunc; |
| dref.lim = blockCount; |
| dref.count = (blockCount + threadCount - 1) / threadCount; |
| |
| CheckResultInfoD dchk; |
| dchk.x = (const double *)gIn_double; |
| dchk.r = (const cl_half *)gOut_half_reference_double; |
| dchk.s = (const cl_half *)gOut_half; |
| dchk.f = doubleReferenceFunc; |
| dchk.lim = blockCount; |
| dchk.count = (blockCount + threadCount - 1) / threadCount; |
| |
| for (i = 0; i < (uint64_t)lastCase; i += stride) |
| { |
| count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i); |
| fref.i = i; |
| dref.i = i; |
| |
| // Create the input and reference |
| ThreadPool_Do(ReferenceF, threadCount, &fref); |
| |
| error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, |
| count * sizeof(float), gIn_single, 0, NULL, |
| NULL); |
| if (error) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| if (gTestDouble) |
| { |
| ThreadPool_Do(ReferenceD, threadCount, &dref); |
| |
| error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, |
| count * sizeof(double), gIn_double, 0, |
| NULL, NULL); |
| if (error) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| } |
| |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| // Loop over vector legths |
| fchk.vsz = g_arrVecSizes[vectorSize]; |
| dchk.vsz = g_arrVecSizes[vectorSize]; |
| |
| for (addressSpace = 0; addressSpace < 3; addressSpace++) |
| { |
| // Loop over address spaces |
| fchk.aspace = addressSpaceNames[addressSpace]; |
| dchk.aspace = addressSpaceNames[addressSpace]; |
| |
| if (!gHostReset) |
| { |
| error = RunKernel(device, resetKernel, gInBuffer_single, |
| gOutBuffer_half, count, 0); |
| } |
| else |
| { |
| cl_uint pattern = 0xdeaddead; |
| memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); |
| |
| error = clEnqueueWriteBuffer( |
| gQueue, gOutBuffer_half, CL_FALSE, 0, |
| count * sizeof(cl_half), gOut_half, 0, NULL, NULL); |
| } |
| if (error) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = RunKernel(device, kernels[vectorSize][addressSpace], |
| gInBuffer_single, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, |
| count * sizeof(cl_half), gOut_half, |
| 0, NULL, NULL); |
| if (error) |
| { |
| vlog_error("Failure in clReadArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = ThreadPool_Do(CheckF, threadCount, &fchk); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| if (gTestDouble) |
| { |
| |
| if (!gHostReset) |
| { |
| error = RunKernel(device, resetKernel, gInBuffer_single, |
| gOutBuffer_half, count, 0); |
| } |
| else |
| { |
| cl_uint pattern = 0xdeaddead; |
| memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); |
| |
| error = clEnqueueWriteBuffer( |
| gQueue, gOutBuffer_half, CL_FALSE, 0, |
| count * sizeof(cl_half), gOut_half, 0, NULL, NULL); |
| } |
| if (error) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = RunKernel(device, |
| doubleKernels[vectorSize][addressSpace], |
| gInBuffer_double, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = clEnqueueReadBuffer( |
| gQueue, gOutBuffer_half, CL_TRUE, 0, |
| count * sizeof(cl_half), gOut_half, 0, NULL, NULL); |
| if (error) |
| { |
| vlog_error("Failure in clReadArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| error = ThreadPool_Do(CheckD, threadCount, &dchk); |
| if (error) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| } |
| } |
| } // end for vector size |
| |
| if (((i + blockCount) & ~printMask) == (i + blockCount)) |
| { |
| vlog("."); |
| fflush(stdout); |
| } |
| } // for end lastcase |
| |
| loopCount = count == blockCount ? 1 : 100; |
| if (gReportTimes) |
| { |
| // Init the input stream |
| cl_float *p = (cl_float *)gIn_single; |
| for (j = 0; j < count; j++) |
| p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); |
| |
| if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, |
| count * sizeof(float), gIn_single, 0, |
| NULL, NULL))) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| |
| if (gTestDouble) |
| { |
| // Init the input stream |
| cl_double *q = (cl_double *)gIn_double; |
| for (j = 0; j < count; j++) |
| q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); |
| |
| if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, |
| 0, count * sizeof(double), |
| gIn_double, 0, NULL, NULL))) |
| { |
| vlog_error("Failure in clWriteArray\n"); |
| gFailCount++; |
| goto exit; |
| } |
| } |
| |
| // Run again for timing |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| uint64_t bestTime = -1ULL; |
| for (j = 0; j < loopCount; j++) |
| { |
| uint64_t startTime = ReadTime(); |
| if ((error = RunKernel(device, kernels[vectorSize][0], |
| gInBuffer_single, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)))) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| if ((error = clFinish(gQueue))) |
| { |
| vlog_error("Failure in clFinish\n"); |
| gFailCount++; |
| goto exit; |
| } |
| uint64_t currentTime = ReadTime() - startTime; |
| if (currentTime < bestTime) bestTime = currentTime; |
| time[vectorSize] += currentTime; |
| } |
| if (bestTime < min_time[vectorSize]) |
| min_time[vectorSize] = bestTime; |
| |
| if (gTestDouble) |
| { |
| bestTime = -1ULL; |
| for (j = 0; j < loopCount; j++) |
| { |
| uint64_t startTime = ReadTime(); |
| if ((error = |
| RunKernel(device, doubleKernels[vectorSize][0], |
| gInBuffer_double, gOutBuffer_half, |
| numVecs(count, vectorSize, aligned), |
| runsOverBy(count, vectorSize, aligned)))) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| if ((error = clFinish(gQueue))) |
| { |
| vlog_error("Failure in clFinish\n"); |
| gFailCount++; |
| goto exit; |
| } |
| uint64_t currentTime = ReadTime() - startTime; |
| if (currentTime < bestTime) bestTime = currentTime; |
| doubleTime[vectorSize] += currentTime; |
| } |
| if (bestTime < min_double_time[vectorSize]) |
| min_double_time[vectorSize] = bestTime; |
| } |
| } |
| } |
| |
| if (gReportTimes) |
| { |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency |
| * gComputeDevices / (double)(count * loopCount), |
| 0, "average us/elem", |
| "vStoreaHalf%s avg. (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6 |
| * gDeviceFrequency * gComputeDevices / (double)count, |
| 0, "best us/elem", |
| "vStoreaHalf%s best (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| if (gTestDouble) |
| { |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6 |
| * gDeviceFrequency * gComputeDevices |
| / (double)(count * loopCount), |
| 0, "average us/elem (double)", |
| "vStoreaHalf%s avg. d (%s vector size: %d)", |
| roundName, addressSpaceNames[0], |
| (g_arrVecSizes[vectorSize])); |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| vlog_perf( |
| SubtractTime(min_double_time[vectorSize], 0) * 1e6 |
| * gDeviceFrequency * gComputeDevices / (double)count, |
| 0, "best us/elem (double)", |
| "vStoreaHalf%s best d (%s vector size: %d)", roundName, |
| addressSpaceNames[0], (g_arrVecSizes[vectorSize])); |
| } |
| } |
| |
| exit: |
| // clean up |
| if (!gHostReset) |
| { |
| clReleaseKernel(resetKernel); |
| clReleaseProgram(resetProgram); |
| } |
| |
| for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; |
| vectorSize++) |
| { |
| for (addressSpace = 0; addressSpace < 3; addressSpace++) |
| { |
| clReleaseKernel(kernels[vectorSize][addressSpace]); |
| clReleaseProgram(programs[vectorSize][addressSpace]); |
| if (gTestDouble) |
| { |
| clReleaseKernel(doubleKernels[vectorSize][addressSpace]); |
| clReleaseProgram(doublePrograms[vectorSize][addressSpace]); |
| } |
| } |
| } |
| |
| return error; |
| } |