Adding initial_accumulator_value parameter to Adagrad (#6616)
diff --git a/test/test_optim.py b/test/test_optim.py
index 736d1b5..8467add 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -349,6 +349,10 @@
             lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
         )
         self._test_basic_cases(
+            lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1,
+                                               initial_accumulator_value=0.1)
+        )
+        self._test_basic_cases(
             lambda weight, bias: optim.Adagrad(
                 self._build_params_dict(weight, bias, lr=1e-2),
                 lr=1e-1)
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index 9f103ab..b39312d 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -19,22 +19,25 @@
         Optimization: http://jmlr.org/papers/v12/duchi11a.html
     """

-    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0):
+    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= lr_decay:
             raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
         if not 0.0 <= weight_decay:
             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        if not 0.0 <= initial_accumulator_value:
+            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))

-        defaults = dict(lr=lr, lr_decay=lr_decay, weight_decay=weight_decay)
+        defaults = dict(lr=lr, lr_decay=lr_decay, weight_decay=weight_decay,
+                        initial_accumulator_value=initial_accumulator_value)
         super(Adagrad, self).__init__(params, defaults)

         for group in self.param_groups:
             for p in group['params']:
                 state = self.state[p]
                 state['step'] = 0
-                state['sum'] = torch.zeros_like(p.data)
+                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

     def share_memory(self):
         for group in self.param_groups:
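
For context, a minimal usage sketch of the new option (not part of the patch; the tensors and training loop below are made-up illustrations). Passing initial_accumulator_value=0.1 seeds each parameter's per-element state['sum'] at 0.1 instead of 0, so the earliest updates divide the gradient by roughly sqrt(0.1 + g**2) rather than sqrt(g**2):

import torch
import torch.optim as optim

# Illustrative only: one parameter tensor and a simple quadratic loss.
weight = torch.randn(10, 5, requires_grad=True)
target = torch.randn(10, 5)

# With this patch, the squared-gradient accumulator starts at 0.1 for
# every element instead of 0, damping the very first steps.
optimizer = optim.Adagrad([weight], lr=1e-1, initial_accumulator_value=0.1)

for _ in range(5):
    optimizer.zero_grad()
    loss = (weight - target).pow(2).mean()
    loss.backward()
    optimizer.step()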