Enable testing the GPU implementations of Adagrad and Adam
Summary:
Enable testing of the GPU implementations of Adagrad and Adam, including the sparse versions (SparseAdagrad, SparseAdam).
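
The removed tests below were restricted to the CPU (**hu.gcs_cpu_only); their new home can sample CUDA device options as well. A minimal sketch of the pattern, assuming the relocated tests simply switch to hu.gcs (the class and test names here are illustrative, not part of this diff):

    # Sketch only: hu.gcs samples both CPU and CUDA device options,
    # whereas hu.gcs_cpu_only (used in the removed tests) does not.
    import numpy as np
    from hypothesis import given
    import hypothesis.strategies as st

    from caffe2.python import core
    import caffe2.python.hypothesis_test_util as hu

    class TestAdagradGPU(hu.HypothesisTestCase):  # illustrative name
        @given(inputs=hu.tensors(n=3),
               lr=st.floats(min_value=0.1, max_value=0.9),
               epsilon=st.floats(min_value=1e-5, max_value=1e-2),
               **hu.gcs)  # CPU and GPU configurations
        def test_adagrad(self, inputs, lr, epsilon, gc, dc):
            w, grad, h = inputs
            h = np.abs(h) + 0.01  # squared-gradient history must stay positive
            lr = np.asarray([lr], dtype=np.float32)
            op = core.CreateOperator(
                "Adagrad",
                ["w", "h", "grad", "lr"],
                ["w", "h"],
                epsilon=epsilon, device_option=gc)
            # Runs the op under every device option in dc and checks that
            # output 0 (the updated weights) agrees across devices.
            self.assertDeviceChecks(dc, op, [w, h, grad, lr], [0])

Note that Adam takes an iters input that must live on the CPU even when the op itself runs on the GPU; the removed tests below already pin it via input_device_options={"iters": hu.cpu_do}, and that pinning carries over unchanged.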
Closes https://github.com/caffe2/caffe2/pull/607
Reviewed By: dzhulgakov
Differential Revision: D5121552
Pulled By: Yangqing
fbshipit-source-id: da6b7dde456237c94cf74d00860e7327b2267eab
diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py
index 10ffddc..358b889 100644
--- a/caffe2/python/hypothesis_test.py
+++ b/caffe2/python/hypothesis_test.py
@@ -599,154 +599,6 @@
# Reference
@staticmethod
- def _dense_adagrad(epsilon, w, h, grad, lr):
- lr = lr[0]
- h_o = h + np.square(grad)
- grad_o = lr * grad / (np.sqrt(h_o) + epsilon)
- w_o = w + grad_o
- return (w_o, h_o)
-
- # Reference
- @staticmethod
- def _dense_adam(epsilon, beta1, beta2, w, m1, m2, grad, lr, iters):
- lr = lr[0]
- iters = iters[0]
- t = iters + 1
- corrected_local_rate = lr * np.sqrt(1. - np.power(beta2, t)) / \
- (1. - np.power(beta1, t))
-
- m1_o = (beta1 * m1) + (1. - beta1) * grad
- m2_o = (beta2 * m2) + (1. - beta2) * np.square(grad)
- grad_o = corrected_local_rate * m1_o / \
- (np.sqrt(m2_o) + epsilon)
- w_o = w + grad_o
- return (w_o, m1_o, m2_o)
-
- @given(inputs=hu.tensors(n=3),
- in_place=st.booleans(),
- lr=st.floats(min_value=0.1, max_value=0.9),
- epsilon=st.floats(min_value=1e-5, max_value=1e-2),
- engine=st.sampled_from([None, "SIMD"]),
- **hu.gcs_cpu_only)
- def test_adagrad_sgd(self, inputs, in_place, lr, epsilon, engine,
- gc, dc):
- w, grad, h = inputs
- h = np.abs(h) + 0.01
- lr = np.asarray([lr], dtype=np.float32)
- op = core.CreateOperator(
- "Adagrad",
- ["w", "h", "grad", "lr"],
- ["w" if in_place else "grad_o",
- "h" if in_place else "h_o"],
- epsilon=epsilon, engine=engine, device_option=gc)
- self.assertDeviceChecks(dc, op, [w, h, grad, lr], [0])
-
- self.assertReferenceChecks(gc, op, [w, h, grad, lr],
- partial(self._dense_adagrad, epsilon))
-
- @given(inputs=hu.tensors(n=3),
- lr=st.floats(min_value=0.1, max_value=0.9),
- epsilon=st.floats(min_value=1e-5, max_value=1e-2),
- engine=st.sampled_from([None, "SIMD"]),
- **hu.gcs_cpu_only)
- def test_sparse_adagrad_sgd(self, inputs, lr, epsilon,
- engine, gc, dc):
- w, grad, h = inputs
- indices = np.arange(h.shape[0])
- indices = indices[indices % 2 == 0]
- grad = grad[indices]
- h = np.abs(h)
- lr = np.asarray([lr], dtype=np.float32)
- op = core.CreateOperator(
- "SparseAdagrad",
- ["param", "h", "indices", "grad", "lr"],
- ["param", "h"],
- epsilon=epsilon,
- engine=engine,
- device_option=gc)
- self.assertDeviceChecks(
- dc, op, [w, h, indices, grad, lr], [0])
-
- def adagrad(param, h, i, grad, lr):
- sw, sh = self._dense_adagrad(epsilon, param[i], h[i], grad, lr)
- h[i] = sh
- param[i] = sw
- return (param, h)
-
- self.assertReferenceChecks(gc, op, [w, h, indices, grad, lr], adagrad)
-
- @given(inputs=hu.tensors(n=4),
- in_place=st.booleans(),
- beta1=st.floats(min_value=0.1, max_value=0.9),
- beta2=st.floats(min_value=0.1, max_value=0.9),
- lr=st.floats(min_value=0.1, max_value=0.9),
- iters=st.integers(min_value=1, max_value=10000),
- epsilon=st.floats(min_value=1e-5, max_value=1e-2),
- **hu.gcs_cpu_only)
- def test_adam_sgd(self, inputs, in_place, beta1, beta2, lr, iters, epsilon,
- gc, dc):
- w, grad, m1, m2 = inputs
- m2 += np.abs(m2) + 0.01
- lr = np.asarray([lr], dtype=np.float32)
- iters = np.asarray([iters], dtype=np.int64)
-
- op = core.CreateOperator(
- "Adam",
- ["w", "m1", "m2", "grad", "lr", "iters"],
- ["w" if in_place else "w_o",
- "m1" if in_place else "m1_o",
- "m2" if in_place else "m2_o"],
- beta1=beta1, beta2=beta2, epsilon=epsilon,
- device_option=gc)
- input_device_options = {"iters": hu.cpu_do}
- inputs = [w, m1, m2, grad, lr, iters]
- self.assertDeviceChecks(
- dc, op, inputs, [0], input_device_options=input_device_options)
-
- self.assertReferenceChecks(gc, op, inputs, partial(self._dense_adam,
- epsilon, beta1, beta2),
- input_device_options=input_device_options)
-
- @given(inputs=hu.tensors(n=4),
- beta1=st.floats(min_value=0.1, max_value=0.9),
- beta2=st.floats(min_value=0.1, max_value=0.9),
- lr=st.floats(min_value=0.1, max_value=0.9),
- iters=st.integers(min_value=1, max_value=10000),
- epsilon=st.floats(min_value=1e-5, max_value=1e-2),
- **hu.gcs_cpu_only)
- def test_sparse_adam_sgd(self, inputs, beta1, beta2, lr, iters,
- epsilon, gc, dc):
-
- w, grad, m1, m2 = inputs
- indices = np.arange(m1.shape[0])
- indices = indices[indices % 2 == 0]
- grad = grad[indices]
- m2 += np.abs(m2) + 0.01
- lr = np.asarray([lr], dtype=np.float32)
- iters = np.asarray([iters], dtype=np.int64)
- op = core.CreateOperator(
- "SparseAdam",
- ["w", "m1", "m2", "indices", "grad", "lr", "iters"],
- ["w", "m1", "m2"],
- beta1=beta1, beta2=beta2, epsilon=epsilon,
- device_option=gc)
- input_device_options = {"iters": hu.cpu_do}
- inputs = [w, m1, m2, indices, grad, lr, iters]
- self.assertDeviceChecks(
- dc, op, inputs, [0], input_device_options=input_device_options)
-
- def adam(w, m1, m2, i, grad, lr, iters):
- nw, nm1, nm2 = self._dense_adam(epsilon, beta1, beta2, w[i],
- m1[i], m2[i], grad, lr, iters)
- w[i] = nw
- m1[i] = nm1
- m2[i] = nm2
- return (w, m1, m2)
-
- self.assertReferenceChecks(gc, op, inputs, adam)
-
- # Reference
- @staticmethod
def _dense_ftrl(alpha, beta, lambda1, lambda2, w, nz, g):
if isinstance(alpha, np.ndarray):
alpha = np.asscalar(alpha)