| // Copyright 2004-present Facebook. All Rights Reserved. |
| |
| #include "opengl_test.h" |
| |
| #include "../core/GLContext.h" |
| #include "../core/GLImageAllocator.h" |
| #include "../core/GLLogging.h" |
| #include "../core/ImageAllocator.h" |
| #include "../core/arm_neon_support.h" |
| #include "../core/rewrite_net.h" |
| |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/core/timer.h" |
| #include "caffe2/core/workspace.h" |
| #include "caffe2/utils/proto_utils.h" |
| |
| #ifdef CAFFE2_USE_MPSCNN |
| #include "caffe2/contrib/ios/mpscnn/mpscnn.h" |
| #endif |
| |
| #define DEBUGGING false |
| |
| namespace caffe2 { |
| |
// Returns |t1 - t2|. Both operands are converted to float before the
// subtraction, so unsigned types such as uint8_t cannot wrap around.
template <class T>
float absolute_error(T t1, T t2) {
  const float lhs = static_cast<float>(t1);
  const float rhs = static_cast<float>(t2);
  return std::abs(lhs - rhs);
}
| |
// Returns |t1 - t2| / |t2|, i.e. the error of t1 relative to the reference
// value t2. The result is defined as 1 (100%) when the reference is zero.
//
// The denominator is the magnitude of t2: dividing by the signed value would
// produce a negative "relative error" for negative references, which then
// trivially satisfies any `<= tolerance` check and hides real mismatches.
template <class T>
float relative_error(T t1, T t2) {
  const float reference = static_cast<float>(t2);
  if (reference == 0.0f) {
    return 1.0f;
  }
  return std::abs(static_cast<float>(t1) - reference) / std::abs(reference);
}
| |
| // OpenGL: t1, CPU: t2 |
| void checkError1D(const TensorCPU& t1, const TensorCPU& t2, float error) { |
| CAFFE_ENFORCE_EQ(t1.size(), t2.size()); |
| #if DEBUGGING |
| gl_log(GL_LOG, "OpenGL output:\n"); |
| for (int i = 0; i < t1.size(); i++) { |
| gl_log(GL_LOG, "%.5f\t", t1.template data<float>()[i]); |
| } |
| gl_log(GL_LOG, "\n"); |
| gl_log(GL_LOG, "CPU output:\n"); |
| for (int i = 0; i < t2.size(); i++) { |
| gl_log(GL_LOG, "%.5f\t", t2.template data<float>()[i]); |
| } |
| gl_log(GL_LOG, "\n"); |
| |
| #else |
| int count = 0; |
| if (t1.template IsType<float>()) { |
| for (auto i = 0; i < t1.size(); ++i) { |
| const float t1_i = t1.template data<float>()[i]; |
| const float t2_i = t2.template data<float>()[i]; |
| |
| if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) { |
| gl_log(GL_ERR, |
| "i: %d, GL: %.2f, CPU: %.2f, absolute error: %.2f, relative error: %.2f%%\n", |
| i, |
| t1_i, |
| t2_i, |
| absolute_error(t1_i, t2_i), |
| relative_error(t1_i, t2_i) * 100); |
| if (count++ == 10) { |
| break; |
| } |
| } |
| } |
| } |
| #endif |
| } |
| |
| // OpenGL: t1, CPU: t2 |
| void checkError(const TensorCPU& t1, const TensorCPU& t2, float error) { |
| CAFFE_ENFORCE_EQ(t1.dims(), t2.dims()); |
| #if DEBUGGING |
| gl_log(GL_LOG, "opengl_test output\n"); |
| gl_log(GL_LOG, "\nOpenGL output:\n"); |
| for (int i = 0; i < t1.size(); i++) { |
| if (t1.ndim() > 2 && i % t1.dim(2) == 0) { |
| gl_log(GL_LOG, "\n"); |
| } |
| if (t1.ndim() > 2 && i != 0 && i % (4 * t2.dim(2) * t2.dim(3)) == 0) { |
| gl_log(GL_LOG, "\n"); |
| } |
| if (t1.template IsType<float>()) { |
| const float t1_i = t1.template data<float>()[i]; |
| gl_log(GL_LOG, "%.3f\t", t1_i); |
| } else if (t1.template IsType<uint8_t>()) { |
| const uint8_t t1_i = t1.template data<uint8_t>()[i]; |
| gl_log(GL_LOG, "%.3d\t", (int)t1_i); |
| } |
| } |
| |
| gl_log(GL_LOG, "\nCPU output:\n"); |
| for (int i = 0; i < t2.size(); i++) { |
| if (t2.ndim() > 2 && i % t2.dim(2) == 0) |
| gl_log(GL_LOG, "\n"); |
| if (t2.ndim() > 2 && i != 0 && i % (4 * t2.dim(2) * t2.dim(3)) == 0) |
| gl_log(GL_LOG, "\n"); |
| if (t2.template IsType<float>()) { |
| const float t2_i = t2.template data<float>()[i]; |
| gl_log(GL_LOG, "%.3f\t", t2_i); |
| } else if (t2.template IsType<uint8_t>()) { |
| const uint8_t t2_i = t2.template data<uint8_t>()[i]; |
| gl_log(GL_LOG, "%.3d\t", (int)t2_i); |
| } |
| } |
| gl_log(GL_LOG, "\n"); |
| #else |
| |
| int count = 0; |
| if (t1.template IsType<float>()) { |
| for (auto i = 0; i < t1.size(); ++i) { |
| const float t1_i = t1.template data<float>()[i]; |
| const float t2_i = t2.template data<float>()[i]; |
| if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) { |
| gl_log(GL_ERR, |
| "i: %d, GL: %.2f, CPU: %.2f, absolute error: %.2f, relative error: %.2f%%\n", |
| i, |
| t1_i, |
| t2_i, |
| absolute_error(t1_i, t2_i), |
| relative_error(t1_i, t2_i) * 100); |
| if (count++ == 10) { |
| break; |
| } |
| } |
| } |
| } else if (t1.template IsType<uint8_t>()) { |
| for (auto i = 0; i < t1.size(); ++i) { |
| const uint8_t t1_i = t1.template data<uint8_t>()[i]; |
| const uint8_t t2_i = t2.template data<uint8_t>()[i]; |
| if (!(absolute_error(t1_i, t2_i) <= error || relative_error(t1_i, t2_i) <= 0.08)) { |
| gl_log(GL_ERR, |
| "i: %d, GL: %d, CPU: %d, absolute error: %.2f, relative error: %.2f%%\n", |
| i, |
| t1_i, |
| t2_i, |
| absolute_error(t1_i, t2_i), |
| relative_error(t1_i, t2_i) * 100); |
| if (count++ == 10) { |
| break; |
| } |
| } |
| } |
| } |
| #endif |
| } |
| |
| void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, int tile_y = 1) { |
| LOG(INFO) << "OPENGLCopyFrom/To Test"; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| |
| // Note: may overflow for half precision |
| // float *data = t->mutable_data<float>(); |
| // for (int i = 0; i < t->size(); i++) { |
| // data[i] = i; |
| // } |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("tile_x"); |
| arg.set_i(tile_x); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("tile_y"); |
| arg.set_i(tile_y); |
| } |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("X_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL |
| const auto& t2 = ws.GetBlob("X_cpu")->Get<TensorCPU>(); // CPU |
| CAFFE_ENFORCE_EQ(t1.dims(), t2.dims()); |
| |
| checkError(t1, t2, error); |
| } |
| |
// Operator variants exercised by testOpenGLConv. The numeric value of each
// enumerator is the index into glPoolOperationName and cpuPoolOperationName,
// so the three lists must be kept in the same order.
enum PoolOp {
  AveragePool = 0,
  MaxPool = 1,
  Conv = 2,
  ConvTranspose = 3,
  ConvPRelu = 4,
  ConvTransposePRelu = 5,
  ConvRelu = 6,
  ConvTransposeRelu = 7
};

// OpenGL operator type for each PoolOp value.
const char* glPoolOperationName[] = {
    "OpenGLAveragePool", // AveragePool
    "OpenGLMaxPool", // MaxPool
    "OpenGLConv", // Conv
    "OpenGLConvTranspose", // ConvTranspose
    "OpenGLConvPRelu", // ConvPRelu
    "OpenGLConvTransposePRelu", // ConvTransposePRelu
    "OpenGLConvRelu", // ConvRelu
    "OpenGLConvTransposeRelu", // ConvTransposeRelu
};

// CPU operator type used as the reference for each PoolOp value. The fused
// PRelu / Relu variants map to the plain convolution here; testOpenGLConv
// appends the activation as a separate CPU operator.
const char* cpuPoolOperationName[] = {
    "AveragePool", // AveragePool
    "MaxPool", // MaxPool
    "Conv", // Conv
    "ConvTranspose", // ConvTranspose
    "Conv", // ConvPRelu
    "ConvTranspose", // ConvTransposePRelu
    "Conv", // ConvRelu
    "ConvTranspose", // ConvTransposeRelu
};
| |
| void testOpenGLConv(int N, |
| int C, |
| int H, |
| int W, |
| int K, // output_channels |
| int kernel_h, |
| int kernel_w, |
| int pad, |
| int stride, |
| PoolOp poolOp, |
| float error, |
| bool random_input = true, |
| int input_batch_size = 1, |
| int output_batch_size = 1, |
| int input_tile_x = 1, |
| int input_tile_y = 1) { |
| LOG(INFO) << "OpenGL Conv Test: " |
| << "input C: " << C << ", output C: " << K << ", H: " << H << ", W: " << W |
| << ", K: " << kernel_w << "x" << kernel_h << ", P: " << pad << ", S: " << stride |
| << " Op: " << glPoolOperationName[poolOp]; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| #if 0 |
| gl_log(GL_LOG, "Input tensor:"); |
| for (int i = 0; i < t->size(); i++) { |
| const float t1_i = t->data<float>()[i]; |
| if (i % t->dim(3) == 0) |
| gl_log(GL_LOG, "\n"); |
| if (i % (4 * t->dim(2) * t->dim(3)) == 0) |
| gl_log(GL_LOG, "-------------------------------\n"); |
| gl_log(GL_LOG, "%.3f\t", t1_i); |
| } |
| gl_log(GL_LOG, "\n\n"); |
| #endif |
| } |
| |
| if (poolOp != AveragePool && poolOp != MaxPool) { |
| auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>(); |
| if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) { |
| t->Resize(K, C, kernel_h, kernel_w); |
| } else { |
| t->Resize(C, K, kernel_h, kernel_w); |
| } |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| // Set the weights to all 1s |
| // for (int i = 0; i < t->size(); i++) { |
| // data[i] = 1; |
| // } |
| |
| // Set the weights to 1s, 2s, 3s... for channel 0, 1, 2, 3... |
| int j = 0; |
| for (int i = 0; i < t->size(); i++) { |
| if (i % (C * kernel_h * kernel_w) == 0) { |
| j++; |
| } |
| data[i] = j; |
| } |
| } |
| |
| #if 0 |
| gl_log(GL_LOG, "Kernel (printing only the first line for each output channel):"); |
| for (int i = 0; i < t->size(); i++) { |
| if (i == 0 || i % (t->dim(1) * t->dim(2) * t->dim(3)) == 0) { |
| gl_log(GL_LOG, "\n"); |
| for (int j = 0; j < t->dim(3); j++) { |
| const float t1_i = t->data<float>()[i + j]; |
| gl_log(GL_LOG, "%.3f\t", t1_i); |
| } |
| } |
| } |
| gl_log(GL_LOG, "\n"); |
| #endif |
| |
| // bias |
| { |
| auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>(); |
| t->Resize(K); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| // Set bias to 1 |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = i + 1; |
| } |
| } |
| #if 0 |
| gl_log(GL_LOG, "Bias:\n"); |
| for (int i = 0; i < t->size(); i++) { |
| const float t1_i = t->data<float>()[i]; |
| gl_log(GL_LOG, "%.3f\t", t1_i); |
| } |
| gl_log(GL_LOG, "\n"); |
| #endif |
| } |
| } |
| |
| if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { |
| auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>(); |
| t->Resize(K); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| // Set prelu scale to i + 1 |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("tile_x"); |
| arg.set_i(input_tile_x); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("tile_y"); |
| arg.set_i(input_tile_y); |
| } |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type(glPoolOperationName[poolOp]); |
| op.add_input("X_gl"); |
| if (poolOp != AveragePool && poolOp != MaxPool) { |
| op.add_input("W"); |
| op.add_input("b"); |
| } |
| if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { |
| op.add_input("p"); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("kernel"); |
| arg.set_i(kernel_h); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("pad"); |
| arg.set_i(pad); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("stride"); |
| arg.set_i(stride); |
| } |
| if (poolOp != AveragePool && poolOp != MaxPool) { |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("input_batch_size"); |
| arg.set_i(input_batch_size); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("output_batch_size"); |
| arg.set_i(output_batch_size); |
| } |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("is_last"); |
| arg.set_i(1); |
| } |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type(cpuPoolOperationName[poolOp]); |
| |
| op.add_input("X_cpu"); |
| if (poolOp != AveragePool && poolOp != MaxPool) { |
| op.add_input("W"); |
| op.add_input("b"); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("kernel"); |
| arg.set_i(kernel_h); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("pad"); |
| arg.set_i(pad); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("stride"); |
| arg.set_i(stride); |
| } |
| op.add_output("Y_ref"); |
| } |
| if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { |
| auto& op = *(netdef.add_op()); |
| op.set_type("PRelu"); |
| op.add_input("Y_ref"); |
| op.add_input("p"); |
| op.add_output("Y_ref"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| } |
| } else if (poolOp == ConvRelu || poolOp == ConvTransposeRelu) { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Relu"); |
| op.add_input("Y_ref"); |
| op.add_output("Y_ref"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| } |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // OpenGL |
| const auto& t2 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(t1, t2, error); |
| } |
| |
| void testOpenGLPRelu(int N, int C, int H, int W, int prelu_size, float error) { |
| LOG(INFO) << "OpenGL PRelu Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx); |
| } |
| |
| // prelu scale |
| { |
| auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>(); |
| t->Resize(prelu_size); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLPRelu"); |
| op.add_input("X_gl"); |
| op.add_input("p"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("PRelu"); |
| op.add_input("X_cpu"); |
| op.add_input("p"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLRelu(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL Relu Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLRelu"); |
| op.add_input("X_gl"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Relu"); |
| op.add_input("X_cpu"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1) { |
| LOG(INFO) << "OpenGL Add Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>(); |
| t0->Resize(N, C, H, W); |
| CPUContext ctx0; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0); |
| |
| auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>(); |
| t1->Resize(N, C, H, W); |
| CPUContext ctx1; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t1->size(), 0, 30, t1->mutable_data<float>(), &ctx1); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu0"); |
| op.add_output("X_gl0"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu1"); |
| op.add_output("X_gl1"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLAdd"); |
| op.add_input("X_gl0"); |
| op.add_input("X_gl1"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Add"); |
| op.add_input("X_cpu0"); |
| op.add_input("X_cpu1"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| } |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLConcat( |
| int N, std::vector<int> Cs, int H, int W, int batch_size = 1, float error = 0.1) { |
| LOG(INFO) << "OpenGL Concat Test " |
| << "H: " << H << ", W: " << W; |
| Workspace ws; |
| for (int i = 0; i < Cs.size(); i++) { |
| auto* t = ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutable<TensorCPU>(); |
| t->Resize(N, Cs[i], H, W); |
| CPUContext ctx0; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx0); |
| } |
| |
| NetDef netdef; |
| for (int i = 0; i < Cs.size(); i++) { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu" + caffe2::to_string(i)); |
| op.add_output("X_gl" + caffe2::to_string(i)); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLConcat"); |
| for (int i = 0; i < Cs.size(); i++) { |
| op.add_input("X_gl" + caffe2::to_string(i)); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("batch_size"); |
| arg.set_i(batch_size); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| } |
| op.add_output("Y_gl"); |
| op.add_output("Y_gl_mask"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Concat"); |
| for (int i = 0; i < Cs.size(); i++) { |
| op.add_input("X_cpu" + caffe2::to_string(i)); |
| } |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| op.add_output("Y_ref_mask"); |
| } |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLSigmoid(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL Sigmoid Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLSigmoid"); |
| op.add_input("X_gl"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Sigmoid"); |
| op.add_input("X_cpu"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLTanh(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL Tanh Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), -3, 3, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLTanh"); |
| op.add_input("X_gl"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Tanh"); |
| op.add_input("X_cpu"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLMul(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL Mul Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx); |
| } |
| |
| { |
| auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>(); |
| t->Resize(1); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLMul"); |
| op.add_input("X_gl"); |
| op.add_input("B"); |
| op.add_output("Y_gl"); |
| |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("broadcast"); |
| arg.set_i(1); |
| } |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Mul"); |
| op.add_input("X_cpu"); |
| op.add_input("B"); |
| |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("broadcast"); |
| arg.set_i(1); |
| } |
| |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLSoftmax(int N, int D, float error) { |
| LOG(INFO) << "OpenGL Softmax Test " |
| << "N: " << N << " D: " << D; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, D); |
| CPUContext ctx; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Reshape"); |
| op.add_input("X_cpu"); |
| op.add_output("X_reshaped"); |
| op.add_output("old_shape"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("shape"); |
| arg.add_ints(N); |
| arg.add_ints(1); |
| arg.add_ints(D); |
| arg.add_ints(1); |
| } |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_reshaped"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLSoftmax"); |
| op.add_input("X_gl"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu0"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Reshape"); |
| op.add_input("Y_cpu0"); |
| op.add_output("Y_cpu"); |
| op.add_output("old_shape"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("shape"); |
| arg.add_ints(N); |
| arg.add_ints(D); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("Softmax"); |
| op.add_input("X_cpu"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL InstanceNorm Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx); |
| // for (auto i = 0; i < t->size(); ++i) { |
| // t->mutable_data<float>()[i] = 0.001; |
| // } |
| } |
| |
| // scale |
| { |
| auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>(); |
| t->Resize(C); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = (i + 1) / t->size(); |
| } |
| } |
| // bias |
| { |
| auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>(); |
| t->Resize(C); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = 8 - 2 * i; |
| } |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLInstanceNorm"); |
| op.add_input("X_gl"); |
| op.add_input("W"); |
| op.add_input("b"); |
| op.add_output("Y_gl"); |
| op.add_output("Mean_gl"); |
| op.add_output("InvStdev_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Mean_gl"); |
| op.add_output("Mean_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("InvStdev_gl"); |
| op.add_output("InvStdev_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("InstanceNorm"); |
| op.add_input("X_cpu"); |
| op.add_input("W"); |
| op.add_input("b"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| op.add_output("Mean_ref"); |
| op.add_output("InvStdev_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| LOG(INFO) << "Check mean"; |
| checkError1D( |
| ws.GetBlob("Mean_cpu")->Get<TensorCPU>(), ws.GetBlob("Mean_ref")->Get<TensorCPU>(), 0.001); |
| LOG(INFO) << "Check inv_stdev"; |
| checkError1D(ws.GetBlob("InvStdev_cpu")->Get<TensorCPU>(), |
| ws.GetBlob("InvStdev_ref")->Get<TensorCPU>(), |
| 0.001); |
| LOG(INFO) << "Check instance norm"; |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL InstanceNormPRelu Test " |
| << "C: " << C << ", H: " << H << ", W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| // Too noisy. |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 30, t->mutable_data<float>(), &ctx); |
| // for (auto i = 0; i < t->size(); ++i) { |
| // t->mutable_data<float>()[i] = 0.001; |
| // } |
| } |
| |
| // scale |
| { |
| auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>(); |
| t->Resize(C); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = (i + 1) / t->size(); |
| } |
| } |
| // bias |
| { |
| auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>(); |
| t->Resize(C); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = 8 - 2 * i; |
| } |
| } |
| // prelu scale |
| { |
| auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>(); |
| t->Resize(C); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLInstanceNormPRelu"); |
| op.add_input("X_gl"); |
| op.add_input("W"); |
| op.add_input("b"); |
| op.add_input("p"); |
| op.add_output("Y_gl"); |
| op.add_output("Mean_gl"); |
| op.add_output("InvStdev_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Mean_gl"); |
| op.add_output("Mean_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("InvStdev_gl"); |
| op.add_output("InvStdev_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("InstanceNorm"); |
| op.add_input("X_cpu"); |
| op.add_input("W"); |
| op.add_input("b"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| op.add_output("Mean_ref"); |
| op.add_output("InvStdev_ref"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("PRelu"); |
| op.add_input("Y_ref"); |
| op.add_input("p"); |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // CPU |
| |
| LOG(INFO) << "Check mean"; |
| checkError1D( |
| ws.GetBlob("Mean_cpu")->Get<TensorCPU>(), ws.GetBlob("Mean_ref")->Get<TensorCPU>(), 0.001); |
| LOG(INFO) << "Check inv_stdev"; |
| checkError1D(ws.GetBlob("InvStdev_cpu")->Get<TensorCPU>(), |
| ws.GetBlob("InvStdev_ref")->Get<TensorCPU>(), |
| 0.001); |
| LOG(INFO) << "Check instance norm"; |
| checkError(ws.GetBlob("Y_cpu")->Get<TensorCPU>(), ws.GetBlob("Y_ref")->Get<TensorCPU>(), error); |
| } |
| |
| void OpenGL_speedtest(int N, |
| int C, |
| int H, |
| int W, |
| int K, |
| int kernel_h, |
| int kernel_w, |
| int pad, |
| float error, |
| bool random_input = true) { |
| LOG(INFO) << "OpenGL Conv Speed Test " |
| << " C: " << C << " H: " << H << " W: " << W; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>(); |
| t->Resize(K, C, kernel_h, kernel_w); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>(); |
| t->Resize(K); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| NetDef netdef; |
| netdef.set_name("Test net"); |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLConv"); |
| op.add_input("X_gl"); |
| op.add_input("W"); |
| op.add_input("b"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("order"); |
| arg.set_s("NCHW"); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("kernel"); |
| arg.set_i(kernel_h); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("pad"); |
| arg.set_i(pad); |
| } |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| CAFFE_ENFORCE(ws.RunNetOnce(netdef)); |
| caffe2::NetBase* net = ws.CreateNet(netdef); |
| CHECK_NOTNULL(net); |
| CAFFE_ENFORCE(net->Run()); |
| net->TEST_Benchmark(1, 4, true); |
| } |
| |
| void testOpenGLPadImage(int N, int C, int H, int W, int pad, float error) { |
| LOG(INFO) << "OpenGLPadImage Test"; |
| { |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| // math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), |
| // &ctx); |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = i + 1; |
| } |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLPadImage"); |
| op.add_input("X_gl"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("pad"); |
| arg.set_i(pad); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("mode"); |
| arg.set_s("reflect"); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("is_last"); |
| arg.set_i(1); |
| } |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("PadImage"); |
| op.add_input("X_cpu"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("pad"); |
| arg.set_i(pad); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("mode"); |
| arg.set_s("reflect"); |
| } |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // opengl |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu |
| checkError(t2, t1, error); |
| } |
| } |
| |
| void testOpenGLResize( |
| int N, int C, int H, int W, int width_scale, int height_scale, int batch_size, float error) { |
| LOG(INFO) << "OpenGLResize Test"; |
| { |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } |
| |
| NetDef netdef; |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLResizeNearest"); |
| op.add_input("X_gl"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("width_scale"); |
| arg.set_f(width_scale); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("height_scale"); |
| arg.set_f(height_scale); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("batch_size"); |
| arg.set_i(batch_size); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("is_last"); |
| arg.set_i(1); |
| } |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("ResizeNearest"); |
| op.add_input("X_cpu"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("width_scale"); |
| arg.set_f(width_scale); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("height_scale"); |
| arg.set_f(height_scale); |
| } |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // opengl |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu |
| checkError(t2, t1, error); |
| } |
| } |
| |
| void testOpenGLPreprocess(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGL Preprocess Test"; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, H, W, C); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<uint8_t>()[i] = rand() % 255; |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>(); |
| t->Resize(3); |
| CPUContext ctx; |
| t->mutable_data<float>()[0] = 100; |
| t->mutable_data<float>()[1] = 50; |
| t->mutable_data<float>()[2] = 150; |
| } |
| |
| NetDef netdef; |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLTensorToTextureStylizerPreprocess"); |
| op.add_input("X_cpu"); |
| op.add_input("mean"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("noise_std"); |
| arg.set_f(0.00001); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("noise_size"); |
| arg.set_i(512); |
| } |
| |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess"); |
| op.add_input("X_cpu"); |
| op.add_input("mean"); |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("noise_std"); |
| arg.set_f(0.00001); |
| } |
| { |
| auto& arg = *(op.add_arg()); |
| arg.set_name("noise_size"); |
| arg.set_i(512); |
| } |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); // openGL |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); // cpu |
| checkError(t2, t1, error); |
| } |
| |
| void testOpenGLDeprocess(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGLDeprocess Test"; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = rand() % 1000 - 500; |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>(); |
| t->Resize(3); |
| CPUContext ctx; |
| t->mutable_data<float>()[0] = 30; |
| t->mutable_data<float>()[1] = 40; |
| t->mutable_data<float>()[2] = 50; |
| } |
| |
| NetDef netdef; |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLTextureToTensorStylizerDeprocess"); |
| op.add_input("X_gl"); |
| op.add_input("mean"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess"); |
| op.add_input("X_cpu"); |
| op.add_input("mean"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); |
| checkError(t2, t1, error); |
| } |
| |
| void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { |
| LOG(INFO) << "OpenGLNormPlanarYUV Test"; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, 3, H, W); |
| CPUContext ctx; |
| for (auto i = 0; i < t->size(); ++i) { |
| t->mutable_data<float>()[i] = rand() % 1000 - 500; |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>(); |
| t->Resize(1, 3); |
| CPUContext ctx; |
| t->mutable_data<float>()[0] = 30; |
| t->mutable_data<float>()[1] = 40; |
| t->mutable_data<float>()[2] = 50; |
| } |
| |
| { |
| auto* t = ws.CreateBlob("stdev")->GetMutable<TensorCPU>(); |
| t->Resize(1, 3); |
| CPUContext ctx; |
| t->mutable_data<float>()[0] = 6; |
| t->mutable_data<float>()[1] = 7; |
| t->mutable_data<float>()[2] = 8; |
| } |
| |
| NetDef netdef; |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("OpenGLNormalizePlanarYUV"); |
| op.add_input("X_gl"); |
| op.add_input("mean"); |
| op.add_input("stdev"); |
| op.add_output("Y_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("Y_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("NormalizePlanarYUV"); |
| op.add_input("X_cpu"); |
| op.add_input("mean"); |
| op.add_input("stdev"); |
| op.add_output("Y_ref"); |
| } |
| |
| ws.RunNetOnce(netdef); |
| const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>(); |
| const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>(); |
| checkError(t2, t1, error); |
| } |
| |
| void OpenGL_copyops_speedtest(int N, |
| int C, |
| int H, |
| int W, |
| int K, |
| int kernel_h, |
| int kernel_w, |
| int pad, |
| float error, |
| bool random_input = true) { |
| LOG(INFO) << "OpenGL CopyOps Speed Test"; |
| Workspace ws; |
| { |
| auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>(); |
| t->Resize(N, C, H, W); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>(); |
| t->Resize(K, C, kernel_h, kernel_w); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| { |
| auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>(); |
| t->Resize(K); |
| CPUContext ctx; |
| if (random_input) { |
| math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx); |
| } else { |
| float* data = t->mutable_data<float>(); |
| for (int i = 0; i < t->size(); i++) { |
| data[i] = 1; |
| } |
| } |
| } |
| |
| NetDef netdef; |
| netdef.set_name("Test net"); |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyToOpenGL"); |
| op.add_input("X_cpu"); |
| op.add_output("X_gl"); |
| } |
| |
| { |
| auto& op = *(netdef.add_op()); |
| op.set_type("CopyFromOpenGL"); |
| op.add_input("X_gl"); |
| op.add_output("Y_cpu"); |
| } |
| |
| caffe2::NetBase* net = ws.CreateNet(netdef); |
| CHECK_NOTNULL(net); |
| net->TEST_Benchmark(1, 4, true); |
| } |
| |
| static NetDef truncateAfter(NetDef def, size_t idx) { |
| // idx = 0, net = 10 -> remove 9 |
| // idx = 0, net = 1 -> remove 0 |
| const auto toRemove = def.op_size() - idx - 1; |
| for (auto i = 0; i < toRemove; ++i) { |
| def.mutable_op()->RemoveLast(); |
| } |
| CHECK_EQ(def.op_size(), idx + 1); |
| return def; |
| } |
| |
| void compareModelsForOpenGL(std::string name, |
| const NetDef& initNet, |
| NetDef predictNet, |
| int width, |
| int height, |
| int channel, |
| std::string input_type, |
| std::string input_order) { |
| if (name == "styleTransfer") { |
| auto* arg = predictNet.mutable_op(0)->mutable_arg(1); |
| CHECK_EQ(arg->name(), "noise_std"); |
| arg->set_f(0.000001); |
| } |
| |
| for (auto i = 0; i < predictNet.op_size(); ++i) { |
| auto truncatedPredictNet = truncateAfter(predictNet, i); |
| |
| // Change the last blob to external_output(0) for the predict net |
| auto output_blob = "_OUTPUT_BLOB__"; |
| truncatedPredictNet.set_external_output(0, output_blob); |
| truncatedPredictNet.mutable_op(truncatedPredictNet.op_size() - 1)->set_output(0, output_blob); |
| |
| NetDef truncatedOpenGLPredictNet = rewritePredictNetForOpenGL(truncatedPredictNet); |
| |
| LOG(INFO) << "truncatedPredictNet"; |
| dumpDefForOpenGL(truncatedPredictNet); |
| |
| LOG(INFO) << "truncatedOpenGLPredictNet"; |
| dumpDefForOpenGL(truncatedOpenGLPredictNet); |
| |
| CPUContext ctx; |
| Workspace cws; |
| cws.RunNetOnce(initNet); |
| |
| auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))->GetMutable<TensorCPU>(); |
| if (name == "styleTransfer") { |
| CAFFE_ENFORCE_EQ(input_order, "NHWC"); |
| CAFFE_ENFORCE_EQ(input_type, "uint8_t"); |
| t_cpu->Resize(1, height, width, channel); |
| for (auto i = 0; i < t_cpu->size(); ++i) { |
| t_cpu->mutable_data<uint8_t>()[i] = i % 255; |
| } |
| } else if (name == "segmentation") { |
| CAFFE_ENFORCE_EQ(input_order, "NCHW"); |
| CAFFE_ENFORCE_EQ(input_type, "float"); |
| t_cpu->Resize(1, channel, height, width); |
| float* input = t_cpu->mutable_data<float>(); |
| const int size = width * height; |
| // Limit input range to YUV |
| math::RandGaussian<float, CPUContext>(size, 0.5, 0.15, input, &ctx); // Y: 0 ~ 1 |
| math::RandGaussian<float, CPUContext>(size, 0, 0.12, input + size, &ctx); // U: -0.436 ~ 0.436 |
| math::RandGaussian<float, CPUContext>( |
| size, 0, 0.2, input + 2 * size, &ctx); // V: -0.615 ~ 0.615 |
| } else { |
| CAFFE_THROW("CompareModels only works with style transfer and segmentation now"); |
| } |
| |
| Workspace mws; |
| mws.RunNetOnce(initNet); |
| |
| auto* t_gl = |
| mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))->GetMutable<TensorCPU>(); |
| if (name == "styleTransfer") { |
| CAFFE_ENFORCE_EQ(input_order, "NHWC"); |
| CAFFE_ENFORCE_EQ(input_type, "uint8_t"); |
| t_gl->Resize(1, height, width, channel); |
| for (auto i = 0; i < t_gl->size(); ++i) { |
| t_gl->mutable_data<uint8_t>()[i] = i % 255; |
| } |
| } else if (name == "segmentation") { |
| CAFFE_ENFORCE_EQ(input_order, "NCHW"); |
| CAFFE_ENFORCE_EQ(input_type, "float"); |
| t_gl->Resize(1, channel, height, width); |
| float* input = t_gl->mutable_data<float>(); |
| memcpy(input, t_cpu->mutable_data<float>(), t_cpu->capacity_nbytes()); |
| } |
| |
| cws.RunNetOnce(truncatedPredictNet); |
| mws.RunNetOnce(truncatedOpenGLPredictNet); |
| |
| const auto m_name = |
| truncatedOpenGLPredictNet.op(truncatedOpenGLPredictNet.op_size() - 1).output(0); |
| const auto c_name = truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0); |
| |
| LOG(INFO) << "Checking correspondence for name: " << m_name << ", idx: " << i; |
| { |
| const auto& mt = mws.GetBlob(m_name)->Get<TensorCPU>(); // GPU |
| const auto& ct = cws.GetBlob(c_name)->Get<TensorCPU>(); // CPU |
| checkError(mt, ct, 1); |
| } |
| } |
| } |
| |
| int runModelBenchmarks(caffe2::NetDef& init_net, |
| caffe2::NetDef& predict_net, |
| int warm_up_runs, |
| int main_runs, |
| int channel, |
| int height, |
| int width, |
| std::string input_type, |
| std::string input_order, |
| std::string engine, // "CPU", "OPENGL", or "MPSCNN" |
| bool run_individual, |
| bool use_texture_input) { |
| std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace()); |
| |
| // caffe2::dumpDefForOpenGL(init_net); |
| caffe2::dumpDefForOpenGL(predict_net); |
| |
| CAFFE_ENFORCE(workspace->RunNetOnce(init_net)); |
| caffe2::NetDef net_def; |
| |
| // rewrite network |
| if (engine == "CPU") { |
| net_def.CopyFrom(predict_net); |
| } else if (engine == "OPENGL") { |
| if (!caffe2::tryConvertToOpenGL(init_net, predict_net, &net_def, use_texture_input)) { |
| CAFFE_THROW("Failed to convert to openGL. Benchmark failed to run"); |
| return -1; |
| } |
| } else if (engine == "MPSCNN") { |
| #ifdef CAFFE2_USE_MPSCNN |
| if (!caffe2::tryConvertToMPSCNN(init_net, predict_net, &net_def)) { |
| CAFFE_THROW("Failed to convert to MPSCNN. Benchmark failed to run"); |
| return -1; |
| } |
| #else |
| CAFFE_THROW("MPSCNN not enabled. Benchmark failed to run"); |
| return -1; |
| #endif |
| } else { |
| CAFFE_THROW("Unsupported engine. Benchmark failed to run"); |
| return -1; |
| } |
| |
| if (!net_def.has_name()) { |
| net_def.set_name("benchmark"); |
| } |
| caffe2::NetBase* net = workspace->CreateNet(net_def); |
| |
| // create input blob |
| if (engine == "CPU" || engine == "MPSCNN" || !use_texture_input) { |
| caffe2::TensorCPU* b; |
| if (!net_def.external_input_size()) { |
| b = workspace->CreateBlob("data")->GetMutable<caffe2::TensorCPU>(); |
| } else { |
| b = workspace->CreateBlob(net_def.external_input(0))->GetMutable<caffe2::TensorCPU>(); |
| } |
| |
| if (input_order == "NCHW") { |
| b->Resize(std::vector<int32_t>( |
| {1, static_cast<int>(channel), static_cast<int>(height), static_cast<int>(width)})); |
| } else if (input_order == "NHWC") { |
| b->Resize(std::vector<int32_t>( |
| {1, static_cast<int>(height), static_cast<int>(width), static_cast<int>(channel)})); |
| } else { |
| CAFFE_THROW("Unknown input order: ", input_order); |
| } |
| if (input_type == "uint8_t") { |
| b->mutable_data<uint8_t>(); |
| } else if (input_type == "float") { |
| b->mutable_data<float>(); |
| } else { |
| CAFFE_THROW("Unknown input type: ", input_type); |
| } |
| } else { |
| const int tile_x = 1, tile_y = 1; |
| ImageAllocator<uint8_t> allocator; |
| GLImageVector<uint8_t>* output_image = allocator.newImage(1, |
| width, |
| height, |
| channel, |
| tile_x, |
| tile_y, |
| #if CAFFE2_IOS |
| true |
| #else |
| false |
| #endif |
| ); |
| |
| Blob* blob = nullptr; |
| if (!net_def.external_input_size()) { |
| blob = workspace->CreateBlob("data"); |
| } else { |
| blob = workspace->CreateBlob(net_def.external_input(0)); |
| } |
| blob->Reset(output_image); |
| const auto textures = (*output_image)[0]->textures; |
| for (int slice = 0; slice < textures.size(); slice++) { |
| textures[slice]->map_load([&](void* buffer, |
| size_t width, |
| size_t height, |
| size_t stride, |
| size_t channels, |
| const GLTexture::Type& type) {}); |
| } |
| } |
| |
| // run benchmark |
| if (engine == "CPU" || engine == "MPSCNN") { |
| CHECK_NOTNULL(net); |
| CAFFE_ENFORCE(net->Run()); |
| net->TEST_Benchmark(warm_up_runs, main_runs, run_individual); |
| } else if (engine == "OPENGL") { |
| CHECK_NOTNULL(net); |
| CAFFE_ENFORCE(net->Run()); |
| |
| for (int i = 0; i < warm_up_runs; i++) { |
| net->Run(); |
| } |
| glFinish(); |
| |
| Timer timer; |
| timer.Start(); |
| for (int i = 0; i < main_runs; i++) { |
| net->Run(); |
| } |
| if (use_texture_input) { |
| glFinish(); |
| } |
| |
| double iter_time = (double)timer.MilliSeconds() / main_runs; |
| LOG(INFO) << "Main run finished. Milliseconds per iter: " << iter_time |
| << ". Iters per second: " << 1000.0 / iter_time; |
| |
| if (run_individual) { |
| std::vector<std::unique_ptr<caffe2::OperatorBase>> ops; |
| |
| for (auto& op : net_def.op()) { |
| ops.push_back(CreateOperator(op, workspace.get())); |
| ops.back()->Run(); // warm up |
| } |
| |
| for (int k = 0; k < ops.size(); k++) { |
| timer.Start(); |
| for (int i = 0; i < main_runs; i++) { |
| ops[k]->Run(); |
| } |
| glFinish(); |
| |
| LOG(INFO) << net_def.op(k).type() << ": " << (double)timer.MilliSeconds() / main_runs; |
| } |
| } |
| } |
| |
| return 0; |
| } |
| |
| template <typename T> |
| void testGLTextureTypes() { |
| gl_log(GL_LOG, "Executing %s...\n", __PRETTY_FUNCTION__); |
| |
| GLImageAllocator<T>* allocator = GLImageAllocator<T>::newGLImageAllocator(); |
| |
| GLImageVector<T>* image = allocator->newImage(1, 10, 10, 4, 1, 1, true); |
| |
| const GLTexture* texture = (*image)[0]->textures[0]; |
| |
| texture->map_load([&](void* buffer, |
| size_t width, |
| size_t height, |
| size_t stride, |
| size_t channels, |
| const GLTexture::Type& type) { |
| T* buffer_data = (T*)buffer; |
| |
| for (int y = 0; y < height; y++) { |
| for (int x = 0; x < width; x++) { |
| for (int c = 0; c < channels; c++) { |
| buffer_data[channels * (y * stride + x) + c] = x + y; |
| } |
| } |
| } |
| }); |
| |
| texture->map_read([&](const void* buffer, |
| size_t width, |
| size_t height, |
| size_t stride, |
| size_t channels, |
| const GLTexture::Type& type) { |
| const T* buffer_data = (const T*)buffer; |
| |
| for (int y = 0; y < height; y++) { |
| for (int x = 0; x < width; x++) { |
| gl_log(GL_LOG, "%d, ", (int)buffer_data[channels * (y * stride + x) + 0]); |
| } |
| gl_log(GL_LOG, "\n"); |
| } |
| }); |
| delete image; |
| delete allocator; |
| gl_log(GL_LOG, "...done with %s\n", __PRETTY_FUNCTION__); |
| } |
| |
// Splits N into two factors r1 * r2 == N that are as close to each other as possible.
//
// Starting from the truncated square root of N, the search walks downwards to the nearest
// divisor of N. That divisor is returned in r2 and its cofactor in r1, so r1 >= r2. A perfect
// square (including 0) yields r1 == r2 == sqrt(N).
void squareFactors(int N, int& r1, int& r2) {
  const int root = static_cast<int>(std::sqrt(static_cast<double>(N)));

  // Perfect square: both factors are the root itself.
  if (root * root == N) {
    r1 = root;
    r2 = root;
    return;
  }

  // Otherwise take the largest divisor that does not exceed the root.
  int divisor = root;
  for (; N % divisor != 0; --divisor) {
  }
  r2 = divisor;
  r1 = N / divisor;
}
| |
| void testOpenGL() { |
| // Test a bunch of different tiled convolutions |
| std::vector<int> channels({4, 8, 16}); |
| |
| for (const auto& input_channels : channels) { |
| int tile_x, tile_y; |
| squareFactors(input_channels / 4, tile_x, tile_y); |
| |
| for (int size = 5; size < 1024; size *= 2) { |
| testOpenGLConv(1, |
| input_channels, |
| size, |
| size, |
| input_channels, |
| 3, |
| 3, |
| 0, |
| 1, |
| Conv, |
| 0.5, |
| true, |
| 1, |
| 1, |
| tile_x, |
| tile_y); |
| } |
| |
| for (int size = 5; size < 1024; size *= 2) { |
| testOpenGLConv(1, |
| input_channels, |
| size, |
| size, |
| input_channels, |
| 3, |
| 3, |
| 0, |
| 1, |
| ConvTranspose, |
| 0.5, |
| true, |
| 1, |
| 1, |
| tile_x, |
| tile_y); |
| } |
| } |
| |
| // Test various paddings and strides with tiled convolution |
| for (int kernel_size = 1; kernel_size <= 5; kernel_size++) { |
| for (int pad = 0; pad < kernel_size; pad++) { |
| for (int stride = 1; stride <= 8; stride++) { |
| testOpenGLConv(1, |
| 16, |
| 100, |
| 100, |
| 16, |
| kernel_size, |
| kernel_size, |
| pad, |
| stride, |
| Conv, |
| 0.5, |
| true, |
| 1, |
| 1, |
| 2, |
| 2); |
| } |
| |
| for (int stride = 1; stride <= 8; stride++) { |
| testOpenGLConv(1, |
| 16, |
| 100, |
| 100, |
| 16, |
| kernel_size, |
| kernel_size, |
| pad, |
| stride, |
| ConvTranspose, |
| 0.5, |
| true, |
| 1, |
| 1, |
| 2, |
| 2); |
| } |
| } |
| } |
| |
| testGLTextureTypes<uint8_t>(); |
| testGLTextureTypes<float16_t>(); |
| |
| testOpenGLCopyOps(1, 4, 4, 4, 1e-2); |
| testOpenGLCopyOps(1, 3, 4, 4, 1e-2); |
| testOpenGLCopyOps(1, 2, 4, 4, 1e-2); |
| testOpenGLCopyOps(1, 1, 4, 4, 1e-2); |
| testOpenGLCopyOps(1, 4, 2, 2, 1e-2); |
| testOpenGLCopyOps(1, 4, 4, 4, 1e-2); |
| testOpenGLCopyOps(1, 4, 1, 1, 1e-2); |
| testOpenGLCopyOps(1, 4, 8, 8, 1e-2); |
| testOpenGLCopyOps(1, 6, 8, 3, 1e-2); |
| testOpenGLCopyOps(1, 4, 1, 2, 1e-2); |
| testOpenGLCopyOps(1, 8, 6, 1, 1e-2); |
| testOpenGLCopyOps(1, 8, 13, 18, 1e-2); |
| testOpenGLCopyOps(1, 16, 13, 18, 1e-2); |
| testOpenGLCopyOps(1, 13, 128, 90, 1e-2); |
| testOpenGLCopyOps(1, 16, 1280, 720, 1e-2); |
| |
| testOpenGLCopyOps(1, 16, 4, 4, 1e-2, 2, 2); |
| testOpenGLCopyOps(1, 64, 16, 16, 1e-2, 2, 2); |
| testOpenGLCopyOps(1, 48, 13, 17, 1e-2, 3, 2); |
| testOpenGLCopyOps(1, 512, 1, 1, 1e-2, 4, 16); |
| testOpenGLCopyOps(1, 256, 7, 7, 1e-2, 8, 8); |
| testOpenGLCopyOps(1, 20, 13, 17, 1e-2, 5, 1); |
| |
| // Test pooling operators |
| LOG(INFO) << "Test pooling operators"; |
| testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, AveragePool, 0.01, true); |
| testOpenGLConv(1, 4, 5, 5, 4, 5, 5, 0, 1, AveragePool, 0.5, true); |
| |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, AveragePool, 0.01, true); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, AveragePool, 0.01, true); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, AveragePool, 0.01, true); |
| |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, MaxPool, 0.01, true); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, MaxPool, 0.01, true); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, MaxPool, 0.01, true); |
| |
| // Test strided convolution |
| LOG(INFO) << "Test strided convolution"; |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 2, Conv, 0.5, true, 1, 1); |
| |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 3, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 1, 3, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 2, 3, Conv, 0.5, true, 1, 1); |
| |
| // Test input batching |
| LOG(INFO) << "Test input batching"; |
| testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1); |
| testOpenGLConv(1, 8, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 2, 1); |
| testOpenGLConv(1, 12, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 3, 1); |
| testOpenGLConv(1, 16, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 4, 1); |
| |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 1, true, 1, 1); // use random input |
| testOpenGLConv(1, 8, 10, 10, 4, 3, 3, 0, 1, Conv, 1, true, 2, 1); // use random input |
| testOpenGLConv(1, 12, 10, 10, 4, 3, 3, 0, 1, Conv, 2, true, 3, 1); // use random input |
| testOpenGLConv(1, 16, 10, 10, 4, 3, 3, 0, 1, Conv, 2, true, 4, 1); // use random input |
| testOpenGLConv(1, 32, 10, 10, 4, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input |
| |
| // Test output batching |
| LOG(INFO) << "Test output batching"; |
| testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1); |
| testOpenGLConv(1, 4, 5, 5, 8, 3, 3, 0, 1, Conv, 0.5, false, 1, 2); |
| testOpenGLConv(1, 4, 5, 5, 12, 3, 3, 0, 1, Conv, 0.5, false, 1, 3); |
| testOpenGLConv(1, 4, 5, 5, 16, 3, 3, 0, 1, Conv, 0.5, false, 1, 4); |
| |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 0.5, true, 1, 1); // use random input |
| testOpenGLConv(1, 4, 10, 10, 8, 3, 3, 0, 1, Conv, 1.5, true, 1, 2); // use random input |
| testOpenGLConv(1, 4, 10, 10, 12, 3, 3, 0, 1, Conv, 0.5, true, 1, 3); // use random input |
| testOpenGLConv(1, 4, 10, 10, 16, 3, 3, 0, 1, Conv, 0.5, true, 1, 4); // use random input |
| |
| // Test both |
| LOG(INFO) << "Test both input and output batching"; |
| testOpenGLConv(1, 4, 5, 5, 4, 3, 3, 0, 1, Conv, 0.5, false, 1, 1); |
| testOpenGLConv(1, 8, 5, 5, 8, 3, 3, 0, 1, Conv, 0.5, false, 2, 2); |
| testOpenGLConv(1, 12, 5, 5, 12, 3, 3, 0, 1, Conv, 0.5, false, 3, 3); |
| |
| testOpenGLConv(1, 4, 10, 10, 4, 3, 3, 0, 1, Conv, 0.5, true, 1, 1); // use random input |
| testOpenGLConv(1, 8, 10, 10, 8, 3, 3, 0, 1, Conv, 1, true, 2, 2); // use random input |
| testOpenGLConv(1, 12, 10, 10, 12, 3, 3, 0, 1, Conv, 2, true, 3, 3); // use random input |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input |
| |
| // Test different combination of batching |
| LOG(INFO) << "Test mixed input and output batching sizes"; |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 2); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 2); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 4); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 4); |
| |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 1, 1); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 2, 1); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 4, 1); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, false, 4, 2); |
| |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 2); // use random input |
| |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); |
| testOpenGLConv(1, 16, 3, 3, 16, 3, 3, 0, 1, Conv, 4, true, 4, 2); |
| |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 2); // use random input |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 2, 2); // use random input |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 4, 1); // use random input |
| testOpenGLConv(1, 16, 10, 10, 16, 3, 3, 0, 1, Conv, 4, true, 1, 4); // use random input |
| |
| // Test input/output channels |
| for (int i = 0; i < 4; i++) { |
| testOpenGLConv(1, 6, 10, 10, i, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input |
| testOpenGLConv(1, 6, 10, 10, i, 3, 3, 0, 1, Conv, 4, true, 2, 1); // use random input |
| } |
| |
| // Test large input size |
| LOG(INFO) << "Test large input size"; |
| testOpenGLConv(1, 4, 1280, 720, 4, 3, 3, 0, 1, Conv, 1, true, 1, 1); // use random input |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input |
| |
| // Test non standard input size |
| testOpenGLConv(1, 16, 1285, 723, 16, 3, 3, 0, 1, Conv, 4, true, 1, 1); // use random input |
| testOpenGLConv(1, 16, 1277, 715, 16, 3, 3, 0, 1, Conv, 4, true, 4, 4); // use random input |
| |
| // Test for different kernel size |
| LOG(INFO) << "Test kernel sizes 4 to 6"; |
| for (int w = 4; w < 7; w++) { |
| testOpenGLConv(1, 4, 1280, 720, 4, w, w, 0, 1, Conv, 4 * (w / 3.0) * (w / 3.0), true, 1, 1); |
| |
| testOpenGLConv(1, 4, 1285, 723, 4, w, w, 0, 1, Conv, 4 * (w / 3.0) * (w / 3.0), true, 1, 1); |
| } |
| |
| // Test a bunch of Transposed Convolutions |
| for (int kernel_size = 1; kernel_size <= 8; kernel_size++) { |
| for (int stride = 1; stride <= 8; stride++) { |
| testOpenGLConv(1, |
| 4, |
| 10, |
| 10, |
| 4, |
| kernel_size, |
| kernel_size, |
| 0, |
| stride, |
| ConvTranspose, |
| 0.5 * (1 + kernel_size / 3.0), |
| true, |
| 1, |
| 1); |
| } |
| } |
| |
| // Test for random failures |
| for (int i = 0; i < 10; i++) { |
| testOpenGLConv(1, 6, 111, 111, 3, 3, 3, 0, 2, ConvTranspose, 0.5, true, 2, 1); |
| testOpenGLConv(1, 16, 56, 56, 6, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 2); |
| } |
| |
| LOG(INFO) << "Test OpenGL ConvPRelu"; |
| testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvPRelu, 2, true, 1, 1); |
| testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvPRelu, 1, true, 1, 1); |
| testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvPRelu, 2, true, 2, 2); |
| testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvPRelu, 4, true, 3, 1); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvPRelu, 4, true, 1, 1); |
| |
| LOG(INFO) << "Test OpenGL ConvTransposePRelu"; |
| testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 1, 1); |
| testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvTransposePRelu, 1, true, 1, 1); |
| testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 2, 2); |
| testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 3, 1); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposePRelu, 4, true, 1, 1); |
| |
| LOG(INFO) << "Test OpenGL ConvRelu"; |
| testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvRelu, 2, true, 1, 1); |
| testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvRelu, 1, true, 1, 1); |
| testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvRelu, 2, true, 2, 2); |
| testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvRelu, 4, true, 3, 1); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvRelu, 4, true, 1, 1); |
| |
| LOG(INFO) << "Test OpenGL ConvTransposeRelu"; |
| testOpenGLConv(1, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposeRelu, 2, true, 1, 1); |
| testOpenGLConv(1, 4, 6, 6, 4, 3, 3, 0, 1, ConvTransposeRelu, 1, true, 1, 1); |
| testOpenGLConv(1, 8, 6, 6, 8, 3, 3, 0, 1, ConvTransposeRelu, 2, true, 2, 2); |
| testOpenGLConv(1, 16, 16, 16, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 12, 16, 16, 8, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 3, 1); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 4, 4); |
| testOpenGLConv(1, 16, 1280, 720, 16, 3, 3, 0, 1, ConvTransposeRelu, 4, true, 1, 1); |
| |
| LOG(INFO) << "Test OpenGL PRelu"; |
| testOpenGLPRelu(1, 4, 16, 16, 4, 0.1); |
| testOpenGLPRelu(1, 4, 16, 16, 1, 0.1); |
| testOpenGLPRelu(1, 6, 640, 360, 6, 0.1); |
| |
| LOG(INFO) << "Test OpenGL Relu"; |
| testOpenGLRelu(1, 4, 16, 16, 0.1); |
| testOpenGLRelu(1, 4, 16, 16, 0.1); |
| testOpenGLRelu(1, 6, 640, 360, 0.1); |
| |
| LOG(INFO) << "Test OpenGL Add"; |
| testOpenGLAdd(1, 16, 640, 360, 0.1); |
| testOpenGLAdd(1, 16, 640, 360, 0.1); |
| testOpenGLAdd(1, 16, 640, 360, 0.1); |
| testOpenGLAdd(1, 12, 640, 360, 0.1); |
| |
| LOG(INFO) << "Test OpenGL Sigmoid"; |
| testOpenGLSigmoid(1, 4, 16, 16, 0.1); |
| testOpenGLSigmoid(1, 12, 64, 48, 0.1); |
| testOpenGLSigmoid(1, 6, 640, 360, 0.1); |
| |
| LOG(INFO) << "Test OpenGL Tanh"; |
| testOpenGLTanh(1, 4, 16, 16, 0.1); |
| testOpenGLTanh(1, 12, 64, 48, 0.1); |
| testOpenGLTanh(1, 6, 640, 360, 0.1); |
| |
| LOG(INFO) << "Test OpenGL Mul"; |
| testOpenGLMul(1, 4, 16, 16, 0.1); |
| testOpenGLMul(1, 12, 64, 48, 0.1); |
| testOpenGLMul(1, 6, 640, 360, 0.1); |
| |
| LOG(INFO) << "Test OpenGL Concat"; |
| testOpenGLConcat(1, std::vector<int>{4, 4}, 16, 16); |
| testOpenGLConcat(1, std::vector<int>{4, 4, 4}, 16, 16); |
| testOpenGLConcat(1, std::vector<int>{4, 4, 4, 4}, 16, 16); |
| testOpenGLConcat(1, std::vector<int>{8, 4, 12}, 16, 16); |
| testOpenGLConcat(1, std::vector<int>{12, 16, 8}, 16, 16); |
| testOpenGLConcat(1, std::vector<int>{60, 24, 36}, 16, 16); |
| |
| LOG(INFO) << "Test OpenGL Softmax"; |
| testOpenGLSoftmax(1, 100, 0.1); |
| testOpenGLSoftmax(1, 1000, 0.1); |
| testOpenGLSoftmax(1, 10000, 0.1); |
| |
| LOG(INFO) << "Test OpenGL InstanceNorm"; |
| testOpenGLInstanceNorm(1, 4, 16, 16, 0.2); |
| testOpenGLInstanceNorm(1, 4, 20, 20, 0.2); |
| testOpenGLInstanceNorm(1, 4, 128, 128, 0.2); |
| testOpenGLInstanceNorm(1, 12, 120, 140, 0.3); |
| testOpenGLInstanceNorm(1, 3, 120, 140, 0.2); |
| testOpenGLInstanceNorm(1, 4, 192, 192, 0.2); |
| |
| testOpenGLInstanceNorm(1, 4, 258, 198, 0.2); |
| testOpenGLInstanceNorm(1, 8, 338, 198, 0.2); |
| testOpenGLInstanceNorm(1, 12, 334, 194, 0.2); |
| testOpenGLInstanceNorm(1, 16, 324, 184, 0.2); |
| testOpenGLInstanceNorm(1, 6, 640, 360, 0.2); |
| |
| LOG(INFO) << "Test OpenGL InstanceNormPRelu"; |
| testOpenGLInstanceNormPRelu(1, 4, 16, 16, 0.2); |
| testOpenGLInstanceNormPRelu(1, 4, 20, 20, 0.2); |
| testOpenGLInstanceNormPRelu(1, 4, 128, 128, 0.2); |
| testOpenGLInstanceNormPRelu(1, 12, 120, 140, 0.3); |
| testOpenGLInstanceNormPRelu(1, 3, 120, 140, 0.2); |
| testOpenGLInstanceNormPRelu(1, 4, 192, 192, 0.2); |
| |
| testOpenGLInstanceNormPRelu(1, 4, 258, 198, 0.2); |
| testOpenGLInstanceNormPRelu(1, 8, 338, 198, 0.2); |
| testOpenGLInstanceNormPRelu(1, 12, 334, 194, 0.2); |
| testOpenGLInstanceNormPRelu(1, 16, 324, 184, 0.2); |
| testOpenGLInstanceNormPRelu(1, 6, 640, 360, 0.2); |
| |
| LOG(INFO) << "Test OpenGL ResizeNearest"; |
| testOpenGLResize(1, 4, 16, 16, 1, 1, 1, 0.1); |
| testOpenGLResize(1, 4, 16, 16, 2, 2, 1, 0.1); |
| testOpenGLResize(1, 4, 16, 16, 3, 3, 1, 0.1); |
| testOpenGLResize(1, 4, 16, 16, 4, 4, 1, 0.1); |
| testOpenGLResize(1, 16, 25, 25, 3, 3, 2, 0.1); |
| testOpenGLResize(1, 16, 25, 25, 3, 3, 4, 0.1); |
| testOpenGLResize(1, 12, 25, 25, 3, 3, 3, 0.1); |
| testOpenGLResize(1, 4, 720, 1280, 3, 3, 1, 0.1); |
| |
| // debug style transfer |
| // conv |
| testOpenGLConv(1, 3, 82, 82, 8, 9, 9, 0, 1, Conv, 4, true, 1, 1); |
| testOpenGLConv(1, 8, 74, 74, 8, 3, 3, 0, 1, Conv, 4, true, 1, 1); |
| testOpenGLConv(1, 8, 82, 82, 12, 3, 3, 0, 1, Conv, 4, true, 1, 1); |
| testOpenGLConv(1, 12, 82, 82, 12, 3, 3, 0, 1, Conv, 4, true, 1, 1); |
| |
| // convtranspose |
| testOpenGLConv(1, 16, 56, 56, 6, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 2); |
| testOpenGLConv(1, 6, 112, 112, 3, 4, 4, 0, 2, ConvTranspose, 0.5, true, 2, 1); |
| |
| LOG(INFO) << "Test OpenGL PadImage"; |
| testOpenGLPadImage(1, 3, 4, 4, 2, 0.01); |
| testOpenGLPadImage(1, 3, 50, 80, 10, 0.01); |
| testOpenGLPadImage(1, 12, 50, 80, 10, 0.01); |
| |
| LOG(INFO) << "Test OpenGL Preprocess"; |
| testOpenGLPreprocess(1, 4, 8, 8, 0.20); |
| testOpenGLPreprocess(1, 4, 1280, 720, 0.20); |
| |
| LOG(INFO) << "Test OpenGL Deprocess"; |
| testOpenGLDeprocess(1, 3, 8, 8, 0.01); |
| testOpenGLDeprocess(1, 3, 1280, 720, 0.01); |
| |
| LOG(INFO) << "Test OpenGL NormalizePlanarYUV"; |
| testOpenGLNormPlanarYUV(1, 3, 8, 8, 0.01); |
| testOpenGLNormPlanarYUV(1, 3, 192, 192, 0.01); |
| |
| // for (int i = 0; i < 4; i += 1) { |
| // LOG(INFO) << "C: " << 4 << ", H: " << 1280 + i << ", W: " << 720 + i; |
| // OpenGL_copyops_speedtest(1, 4, 1280, 720 + i, 4, 3, 3, 0, 0.5); |
| // } |
| |
| // for (int i = 0; i < 1; i += 1) { |
| // LOG(INFO) << "C: " << 16 << ", H: " << 1280 + i << ", W: " << 720 + i; |
| // OpenGL_copyops_speedtest(1, 16, 1280, 720 + i, 16, 3, 3, 0, 0.5); |
| // } |
| // |
| // for (int i = 0; i < 9; i += 1) { |
| // LOG(INFO) << "C: " << 16 << ", H: " << 1280 + i << ", W: " << 720 + i; |
| // OpenGL_speedtest(1, 16, 1280, 720 + i, 16, 3, 3, 0, 0.5); |
| // } |
| |
| // Multi-Batch Tests |
| LOG(INFO) << "Test OpenGL Multi-batch Support"; |
| testOpenGLCopyOps(2, 4, 4, 4, 1e-2); |
| testOpenGLCopyOps(3, 4, 4, 4, 1e-2); |
| testOpenGLCopyOps(5, 4, 4, 4, 1e-2); |
| testOpenGLConv(2, 4, 5, 5, 4, 3, 3, 0, 1, AveragePool, 0.01, true); |
| testOpenGLConv(2, 4, 10, 10, 4, 3, 3, 0, 2, MaxPool, 0.01, true); |
| testOpenGLConv(3, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(5, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(7, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(11, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(12, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(21, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(50, 4, 10, 10, 4, 3, 3, 0, 2, Conv, 0.5, true, 1, 1); |
| testOpenGLConv(3, 4, 10, 10, 4, 3, 3, 0, 2, ConvTranspose, 0.5, true, 1, 1); |
| testOpenGLConv(3, 16, 6, 6, 16, 3, 3, 0, 1, ConvPRelu, 2, true, 1, 1); |
| testOpenGLConv(3, 16, 6, 6, 16, 3, 3, 0, 1, ConvTransposePRelu, 2, true, 1, 1); |
| |
| testOpenGLPRelu(3, 4, 16, 16, 4, 0.1); |
| testOpenGLPRelu(5, 4, 16, 16, 4, 0.1); |
| |
| testOpenGLRelu(3, 4, 16, 16, 0.1); |
| testOpenGLRelu(7, 4, 16, 16, 0.1); |
| |
| testOpenGLAdd(3, 16, 640, 360, 0.1); |
| testOpenGLAdd(9, 16, 640, 360, 0.1); |
| |
| testOpenGLSigmoid(3, 4, 16, 16, 0.1); |
| testOpenGLSigmoid(11, 4, 16, 16, 0.1); |
| |
| testOpenGLInstanceNorm(3, 4, 16, 16, 0.2); |
| testOpenGLInstanceNorm(13, 4, 16, 16, 0.2); |
| |
| testOpenGLInstanceNormPRelu(3, 4, 16, 16, 0.2); |
| testOpenGLInstanceNormPRelu(15, 4, 16, 16, 0.2); |
| |
| testOpenGLResize(3, 4, 16, 16, 1, 1, 1, 0.1); |
| testOpenGLResize(16, 4, 16, 16, 1, 1, 1, 0.1); |
| |
| testOpenGLPadImage(3, 3, 4, 4, 2, 0.01); |
| testOpenGLPadImage(23, 3, 4, 4, 2, 0.01); |
| |
| testOpenGLSoftmax(3, 1000, 0.1); |
| testOpenGLSoftmax(27, 100, 0.1); |
| |
| testOpenGLNormPlanarYUV(4, 3, 192, 192, 0.01); |
| |
| LOG(INFO) << "End of OpenGL tests"; |
| } |
| } // namespace caffe2 |