Add BiasCHW fallback for GPU (#7738)

diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h
index 0ee509a..dd35678 100644
--- a/caffe2/operators/conv_transpose_op_impl.h
+++ b/caffe2/operators/conv_transpose_op_impl.h
@@ -102,8 +102,8 @@
       // Bias term
       if (InputSize() == 3) {
         const T* bias_data = Input(BIAS).template data<T>();
-#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
         const T* bm_data = bias_multiplier_.template data<T>();
+#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
         math::Gemm<T, Context>(
             CblasNoTrans,
             CblasNoTrans,
@@ -119,6 +119,7 @@
 #else
         math::BiasCHW<T, Context>(
             bias_data,
+            bm_data,
             C,
             output_image_size,
             Ydata,
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index e508b37..bd0b2d6 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -488,6 +488,7 @@
 template <typename T, class Context>
 void BiasCHW(
   const T* bias,
+  const T* bias_multiplier,
   const int bias_channels,
   const int image_size,
   T* image,
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index 13d65a9..140471a 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -1675,6 +1675,7 @@
 template <>
 void BiasCHW<float, CPUContext>(
     const float* bias,
+    const float* /*bias_multiplier*/,
     const int bias_channels,
     const int image_size,
     float* image,
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 3c0924a..ca05238a 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -272,6 +272,28 @@
 }
 
 template <>
+void BiasCHW<float, CUDAContext>( // CUDA fallback: add per-channel bias to a CHW tensor via GEMM
+    const float* bias, // device ptr, one value per channel; length = bias_channels
+    const float* bias_multiplier, // device ptr, length = image_size; presumably all ones — verify at call site
+    const int bias_channels, // C: number of channels
+    const int image_size, // H*W: elements per channel plane
+    float* image, // in/out device ptr, C x (H*W); bias is accumulated into it
+    CUDAContext* context) {
+  Gemm<float, CUDAContext>( // rank-1 update: image += bias (Cx1) * bias_multiplier (1xHW)
+      CblasNoTrans,
+      CblasNoTrans,
+      bias_channels, // M: rows of output
+      image_size, // N: cols of output
+      1, // K = 1 makes this an outer product
+      1, // alpha = 1
+      bias, // A: bias_channels x 1
+      bias_multiplier, // B: 1 x image_size
+      1, // beta = 1: accumulate, preserving existing image contents
+      image, // C: bias_channels x image_size
+      context);
+}
+
+template <>
 void GemmBatched<float, CUDAContext>(
     const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB,