Enable testing the GPU implementations of Adagrad and Adam

Summary:
Enable testing the GPU implementations of Adagrad and Adam, including their sparse versions.
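
For context, a minimal sketch (not part of this diff) of what a GPU-enabled
variant of the dense Adagrad test removed below might look like: the
substantive change is swapping **hu.gcs_cpu_only for **hu.gcs so that
assertDeviceChecks also exercises the CUDA kernel, and dropping the CPU-only
"SIMD" engine. Names mirror the removed test; the `_dense_adagrad` reference
helper is assumed to move along with it.

    @given(inputs=hu.tensors(n=3),
           in_place=st.booleans(),
           lr=st.floats(min_value=0.1, max_value=0.9),
           epsilon=st.floats(min_value=1e-5, max_value=1e-2),
           **hu.gcs)  # gc/dc now include a CUDA device option when available
    def test_adagrad_sgd(self, inputs, in_place, lr, epsilon, gc, dc):
        w, grad, h = inputs
        h = np.abs(h) + 0.01  # Adagrad state must stay positive
        lr = np.asarray([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["w", "h", "grad", "lr"],
            ["w" if in_place else "grad_o",
             "h" if in_place else "h_o"],
            epsilon=epsilon, device_option=gc)
        # Run the op on every device in dc (CPU and GPU) and compare outputs.
        self.assertDeviceChecks(dc, op, [w, h, grad, lr], [0])
        self.assertReferenceChecks(gc, op, [w, h, grad, lr],
                                   partial(self._dense_adagrad, epsilon))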
Closes https://github.com/caffe2/caffe2/pull/607

Reviewed By: dzhulgakov

Differential Revision: D5121552

Pulled By: Yangqing

fbshipit-source-id: da6b7dde456237c94cf74d00860e7327b2267eab
diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py
index 10ffddc..358b889 100644
--- a/caffe2/python/hypothesis_test.py
+++ b/caffe2/python/hypothesis_test.py
@@ -599,154 +599,6 @@
 
     # Reference
     @staticmethod
-    def _dense_adagrad(epsilon, w, h, grad, lr):
-        lr = lr[0]
-        h_o = h + np.square(grad)
-        grad_o = lr * grad / (np.sqrt(h_o) + epsilon)
-        w_o = w + grad_o
-        return (w_o, h_o)
-
-    # Reference
-    @staticmethod
-    def _dense_adam(epsilon, beta1, beta2, w, m1, m2, grad, lr, iters):
-        lr = lr[0]
-        iters = iters[0]
-        t = iters + 1
-        corrected_local_rate = lr * np.sqrt(1. - np.power(beta2, t)) / \
-            (1. - np.power(beta1, t))
-
-        m1_o = (beta1 * m1) + (1. - beta1) * grad
-        m2_o = (beta2 * m2) + (1. - beta2) * np.square(grad)
-        grad_o = corrected_local_rate * m1_o / \
-            (np.sqrt(m2_o) + epsilon)
-        w_o = w + grad_o
-        return (w_o, m1_o, m2_o)
-
-    @given(inputs=hu.tensors(n=3),
-           in_place=st.booleans(),
-           lr=st.floats(min_value=0.1, max_value=0.9),
-           epsilon=st.floats(min_value=1e-5, max_value=1e-2),
-           engine=st.sampled_from([None, "SIMD"]),
-           **hu.gcs_cpu_only)
-    def test_adagrad_sgd(self, inputs, in_place, lr, epsilon, engine,
-                         gc, dc):
-        w, grad, h = inputs
-        h = np.abs(h) + 0.01
-        lr = np.asarray([lr], dtype=np.float32)
-        op = core.CreateOperator(
-            "Adagrad",
-            ["w", "h", "grad", "lr"],
-            ["w" if in_place else "grad_o",
-             "h" if in_place else "h_o"],
-            epsilon=epsilon, engine=engine, device_option=gc)
-        self.assertDeviceChecks(dc, op, [w, h, grad, lr], [0])
-
-        self.assertReferenceChecks(gc, op, [w, h, grad, lr],
-                                   partial(self._dense_adagrad, epsilon))
-
-    @given(inputs=hu.tensors(n=3),
-           lr=st.floats(min_value=0.1, max_value=0.9),
-           epsilon=st.floats(min_value=1e-5, max_value=1e-2),
-           engine=st.sampled_from([None, "SIMD"]),
-           **hu.gcs_cpu_only)
-    def test_sparse_adagrad_sgd(self, inputs, lr, epsilon,
-                                engine, gc, dc):
-        w, grad, h = inputs
-        indices = np.arange(h.shape[0])
-        indices = indices[indices % 2 == 0]
-        grad = grad[indices]
-        h = np.abs(h)
-        lr = np.asarray([lr], dtype=np.float32)
-        op = core.CreateOperator(
-            "SparseAdagrad",
-            ["param", "h", "indices", "grad", "lr"],
-            ["param", "h"],
-            epsilon=epsilon,
-            engine=engine,
-            device_option=gc)
-        self.assertDeviceChecks(
-            dc, op, [w, h, indices, grad, lr], [0])
-
-        def adagrad(param, h, i, grad, lr):
-            sw, sh = self._dense_adagrad(epsilon, param[i], h[i], grad, lr)
-            h[i] = sh
-            param[i] = sw
-            return (param, h)
-
-        self.assertReferenceChecks(gc, op, [w, h, indices, grad, lr], adagrad)
-
-    @given(inputs=hu.tensors(n=4),
-           in_place=st.booleans(),
-           beta1=st.floats(min_value=0.1, max_value=0.9),
-           beta2=st.floats(min_value=0.1, max_value=0.9),
-           lr=st.floats(min_value=0.1, max_value=0.9),
-           iters=st.integers(min_value=1, max_value=10000),
-           epsilon=st.floats(min_value=1e-5, max_value=1e-2),
-           **hu.gcs_cpu_only)
-    def test_adam_sgd(self, inputs, in_place, beta1, beta2, lr, iters, epsilon,
-                      gc, dc):
-        w, grad, m1, m2 = inputs
-        m2 += np.abs(m2) + 0.01
-        lr = np.asarray([lr], dtype=np.float32)
-        iters = np.asarray([iters], dtype=np.int64)
-
-        op = core.CreateOperator(
-            "Adam",
-            ["w", "m1", "m2", "grad", "lr", "iters"],
-            ["w" if in_place else "w_o",
-             "m1" if in_place else "m1_o",
-             "m2" if in_place else "m2_o"],
-            beta1=beta1, beta2=beta2, epsilon=epsilon,
-            device_option=gc)
-        input_device_options = {"iters": hu.cpu_do}
-        inputs = [w, m1, m2, grad, lr, iters]
-        self.assertDeviceChecks(
-            dc, op, inputs, [0], input_device_options=input_device_options)
-
-        self.assertReferenceChecks(gc, op, inputs, partial(self._dense_adam,
-                                   epsilon, beta1, beta2),
-                                   input_device_options=input_device_options)
-
-    @given(inputs=hu.tensors(n=4),
-           beta1=st.floats(min_value=0.1, max_value=0.9),
-           beta2=st.floats(min_value=0.1, max_value=0.9),
-           lr=st.floats(min_value=0.1, max_value=0.9),
-           iters=st.integers(min_value=1, max_value=10000),
-           epsilon=st.floats(min_value=1e-5, max_value=1e-2),
-           **hu.gcs_cpu_only)
-    def test_sparse_adam_sgd(self, inputs, beta1, beta2, lr, iters,
-                             epsilon, gc, dc):
-
-        w, grad, m1, m2 = inputs
-        indices = np.arange(m1.shape[0])
-        indices = indices[indices % 2 == 0]
-        grad = grad[indices]
-        m2 += np.abs(m2) + 0.01
-        lr = np.asarray([lr], dtype=np.float32)
-        iters = np.asarray([iters], dtype=np.int64)
-        op = core.CreateOperator(
-            "SparseAdam",
-            ["w", "m1", "m2", "indices", "grad", "lr", "iters"],
-            ["w", "m1", "m2"],
-            beta1=beta1, beta2=beta2, epsilon=epsilon,
-            device_option=gc)
-        input_device_options = {"iters": hu.cpu_do}
-        inputs = [w, m1, m2, indices, grad, lr, iters]
-        self.assertDeviceChecks(
-            dc, op, inputs, [0], input_device_options=input_device_options)
-
-        def adam(w, m1, m2, i, grad, lr, iters):
-            nw, nm1, nm2 = self._dense_adam(epsilon, beta1, beta2, w[i],
-                                            m1[i], m2[i], grad, lr, iters)
-            w[i] = nw
-            m1[i] = nm1
-            m2[i] = nm2
-            return (w, m1, m2)
-
-        self.assertReferenceChecks(gc, op, inputs, adam)
-
-    # Reference
-    @staticmethod
     def _dense_ftrl(alpha, beta, lambda1, lambda2, w, nz, g):
         if isinstance(alpha, np.ndarray):
             alpha = np.asscalar(alpha)